From 1b8151c081149268ab2fec3961570bb538f1194b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 21 Nov 2024 16:05:35 +0100 Subject: [PATCH 01/45] Try to work around issue with NVHPC in conjunction of older CTK versions (#2889) NVHPC can consume older CTK headers for stdpar, so we need to try and avoid using those --- cub/cub/thread/thread_operators.cuh | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index cfc47edcfe7..2de65083843 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -440,10 +440,15 @@ struct SimdMin<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x)), + ::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmin2(a, b);), (return __halves2half2(__float2half(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x))), __float2half(::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -470,11 +475,16 @@ struct SimdMin<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)), + ::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmin2(a, b);), (return cub::internal::halves2bfloat162( __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))), __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -521,10 +531,15 @@ struct SimdMax<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(::cuda::maximum<>{}(__half2float(a.x), __half2float(b.x)), + ::cuda::maximum<>{}(__half2float(a.y), __half2float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmax2(a, b);), (return __halves2half2(__float2half(::cuda::maximum<>{}(__half2float(a.x), __half2float(b.x))), __float2half(::cuda::maximum<>{}(__half2float(a.y), __half2float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -539,11 +554,16 @@ struct SimdMax<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn(::cuda::maximum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)), + ::cuda::maximum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return 
__hmax2(a, b);), (return cub::internal::halves2bfloat162( __float2bfloat16(::cuda::maximum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))), __float2bfloat16(::cuda::maximum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -566,10 +586,14 @@ struct SimdSum<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(__half2float(a.x) + __half2float(b.x), __half2float(a.y) + __half2float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hadd2(a, b);), (return __halves2half2(__float2half(__half2float(a.x) + __half2float(b.x)), __float2half(__half2float(a.y) + __half2float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -584,11 +608,16 @@ struct SimdSum<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn( + __bfloat162float(a.x) + __bfloat162float(b.x), __bfloat162float(a.y) + __bfloat162float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET( NV_PROVIDES_SM_80, (return __hadd2(a, b);), (return cub::internal::halves2bfloat162(__float2bfloat16(__bfloat162float(a.x) + __bfloat162float(b.x)), __float2bfloat16(__bfloat162float(a.y) + __bfloat162float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -611,10 +640,14 @@ struct SimdMul<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(__half2float(a.x) * __half2float(b.x), __half2float(a.y) * __half2float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hmul2(a, b);), (return __halves2half2(__float2half(__half2float(a.x) * __half2float(b.x)), __float2half(__half2float(a.y) * __half2float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -629,10 +662,15 @@ struct SimdMul<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn( + __bfloat162float(a.x) * __bfloat162float(b.x), __bfloat162float(a.y) * __bfloat162float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmul2(a, b);), (return halves2bfloat162(__float2bfloat16(__bfloat162float(a.x) * __bfloat162float(b.x)), __float2bfloat16(__bfloat162float(a.y) * __bfloat162float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; From 9af2a13df00318cefb4902500bef74074ec50e8e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 21 Nov 2024 16:14:53 +0100 Subject: [PATCH 02/45] Refactoring (#2905) --- .../nvbench_helper/nvbench_helper.cuh | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh 
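[Editor's note on the first patch above] The NVHPC/old-CTK branch avoids the half2/bfloat162 intrinsics entirely and goes through float instead. A reduced, stand-alone sketch of that fallback pattern, assuming only the basic cuda_fp16.h conversion helpers are available; half2_min_via_float is a made-up name for illustration, not part of the patch:

  #include <cuda_fp16.h>

  // Per-lane minimum of two __half2 values without __hmin2: convert each lane
  // to float, compare, and repack with round-to-nearest.
  __device__ __half2 half2_min_via_float(__half2 a, __half2 b)
  {
    const float ax = __half2float(a.x);
    const float ay = __half2float(a.y);
    const float bx = __half2float(b.x);
    const float by = __half2float(b.y);
    return __floats2half2_rn(ax < bx ? ax : bx, ay < by ? ay : by);
  }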
b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index e8dacb4a1ff..88b189cf964 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -418,52 +418,51 @@ struct less_t { return lhs < rhs; } -}; - -template <> -__host__ __device__ inline bool less_t::operator()(const complex& lhs, const complex& rhs) const -{ - double magnitude_0 = cuda::std::abs(lhs); - double magnitude_1 = cuda::std::abs(rhs); - if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1)) - { - // NaN's are always equal. - return false; - } - else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1)) + __host__ __device__ inline bool operator()(const complex& lhs, const complex& rhs) const { - // If the real or imaginary part of the complex number has a very large value - // (close to the maximum representable value for a double), it is possible that - // the magnitude computation can result in positive infinity: - // ```cpp - // const double large_number = std::numeric_limits::max() / 2; - // std::complex z(large_number, large_number); - // std::abs(z) == inf; - // ``` - // Dividing both components by a constant before computing the magnitude prevents overflow. - const complex::value_type scaler = 0.5; - - magnitude_0 = cuda::std::abs(lhs * scaler); - magnitude_1 = cuda::std::abs(rhs * scaler); - } + double magnitude_0 = cuda::std::abs(lhs); + double magnitude_1 = cuda::std::abs(rhs); + + if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1)) + { + // NaN's are always equal. + return false; + } + else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1)) + { + // If the real or imaginary part of the complex number has a very large value + // (close to the maximum representable value for a double), it is possible that + // the magnitude computation can result in positive infinity: + // ```cpp + // const double large_number = std::numeric_limits::max() / 2; + // std::complex z(large_number, large_number); + // std::abs(z) == inf; + // ``` + // Dividing both components by a constant before computing the magnitude prevents overflow. + const complex::value_type scaler = 0.5; + + magnitude_0 = cuda::std::abs(lhs * scaler); + magnitude_1 = cuda::std::abs(rhs * scaler); + } - const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1); - const complex::value_type threshold = cuda::std::numeric_limits::epsilon() * 2; + const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1); + const complex::value_type threshold = cuda::std::numeric_limits::epsilon() * 2; - if (difference < threshold) - { - // Triangles with the same magnitude are sorted by their phase angle. - const complex::value_type phase_angle_0 = cuda::std::arg(lhs); - const complex::value_type phase_angle_1 = cuda::std::arg(rhs); + if (difference < threshold) + { + // Triangles with the same magnitude are sorted by their phase angle. 
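[Editor's note] The comparator being folded into less_t here orders complex values by magnitude, rescaling both operands first so that cuda::std::abs cannot overflow to infinity when both components are near the maximum representable value. A reduced sketch of just that scaling step, shown with cuda::std::complex<double> for simplicity; the magnitude_less helper is illustrative, not part of the benchmark helper:

  #include <cuda/std/complex>

  // For z with both components near DBL_MAX, cuda::std::abs(z) overflows to +inf.
  // Halving both operands keeps the intermediate finite and, because both sides
  // are scaled by the same positive factor, preserves the ordering.
  __host__ __device__ inline bool magnitude_less(cuda::std::complex<double> lhs,
                                                 cuda::std::complex<double> rhs)
  {
    const double scaler = 0.5;
    return cuda::std::abs(lhs * scaler) < cuda::std::abs(rhs * scaler);
  }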
+ const complex::value_type phase_angle_0 = cuda::std::arg(lhs); + const complex::value_type phase_angle_1 = cuda::std::arg(rhs); - return phase_angle_0 < phase_angle_1; - } - else - { - return magnitude_0 < magnitude_1; + return phase_angle_0 < phase_angle_1; + } + else + { + return magnitude_0 < magnitude_1; + } } -} +}; struct max_t { From 801b794cc8f46837cd66595eb9f0bc6824907630 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 21 Nov 2024 11:02:57 -0800 Subject: [PATCH 03/45] add "`interface`" to `_CCCL_PUSH_MACROS` (#2919) --- libcudacxx/include/cuda/std/__cccl/diagnostic.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/diagnostic.h b/libcudacxx/include/cuda/std/__cccl/diagnostic.h index 4183fd96bf7..fdedae215f3 100644 --- a/libcudacxx/include/cuda/std/__cccl/diagnostic.h +++ b/libcudacxx/include/cuda/std/__cccl/diagnostic.h @@ -175,9 +175,12 @@ # define _CCCL_PUSH_MACROS _CCCL_MSVC_WARNINGS_PUSH # define _CCCL_POP_MACROS _CCCL_MSVC_WARNINGS_POP #else // ^^^ _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO ^^^ / vvv !_CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO vvv -# define _CCCL_PUSH_MACROS _CCCL_PRAGMA(push_macro("min")) _CCCL_PRAGMA(push_macro("max")) _CCCL_MSVC_WARNINGS_PUSH -# define _CCCL_POP_MACROS _CCCL_PRAGMA(pop_macro("min")) _CCCL_PRAGMA(pop_macro("max")) _CCCL_MSVC_WARNINGS_POP - +# define _CCCL_PUSH_MACROS \ + _CCCL_PRAGMA(push_macro("min")) \ + _CCCL_PRAGMA(push_macro("max")) _CCCL_PRAGMA(push_macro("interface")) _CCCL_MSVC_WARNINGS_PUSH +# define _CCCL_POP_MACROS \ + _CCCL_PRAGMA(pop_macro("min")) \ + _CCCL_PRAGMA(pop_macro("max")) _CCCL_PRAGMA(pop_macro("interface")) _CCCL_MSVC_WARNINGS_POP #endif // !_CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO #endif // __CCCL_DIAGNOSTIC_H From 0722044948f46e61e704828be78843ff256c7eb6 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 21 Nov 2024 12:34:03 -0800 Subject: [PATCH 04/45] Replace inconsistent Doxygen macros with `_CCCL_DOXYGEN_INVOKED` (#2921) fixes #2362 --- cub/cub/block/block_discontinuity.cuh | 4 +- cub/cub/block/block_exchange.cuh | 4 +- cub/cub/block/block_load.cuh | 4 +- cub/cub/block/block_merge_sort.cuh | 4 +- cub/cub/block/block_radix_rank.cuh | 16 ++--- cub/cub/block/block_radix_sort.cuh | 8 +-- cub/cub/block/block_run_length_decode.cuh | 4 +- cub/cub/block/block_scan.cuh | 4 +- cub/cub/block/block_store.cuh | 4 +- cub/cub/block/radix_rank_sort_operations.cuh | 4 +- cub/cub/detail/array_utils.cuh | 4 +- cub/cub/detail/detect_cuda_runtime.cuh | 2 +- cub/cub/detail/nvtx.cuh | 4 +- cub/cub/detail/strong_load.cuh | 4 +- cub/cub/detail/strong_store.cuh | 4 +- cub/cub/device/device_adjacent_difference.cuh | 16 ++--- cub/cub/device/device_histogram.cuh | 32 +++++----- cub/cub/device/device_merge_sort.cuh | 24 +++---- cub/cub/device/device_partition.cuh | 12 ++-- cub/cub/device/device_radix_sort.cuh | 16 ++--- cub/cub/device/device_reduce.cuh | 28 ++++---- cub/cub/device/device_run_length_encode.cuh | 8 +-- cub/cub/device/device_scan.cuh | 56 ++++++++-------- .../device/device_segmented_radix_sort.cuh | 32 +++++----- cub/cub/device/device_segmented_reduce.cuh | 24 +++---- cub/cub/device/device_segmented_sort.cuh | 64 +++++++++---------- cub/cub/device/device_select.cuh | 24 +++---- cub/cub/device/device_spmv.cuh | 4 +- cub/cub/device/device_transform.cuh | 16 ++--- .../dispatch/dispatch_adjacent_difference.cuh | 8 +-- .../device/dispatch/dispatch_histogram.cuh | 16 ++--- cub/cub/device/dispatch/dispatch_reduce.cuh | 16 ++--- .../dispatch/dispatch_reduce_by_key.cuh | 4 +- 
cub/cub/device/dispatch/dispatch_rle.cuh | 4 +- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +-- .../device/dispatch/dispatch_scan_by_key.cuh | 8 +-- .../dispatch/dispatch_segmented_sort.cuh | 8 +-- .../device/dispatch/dispatch_select_if.cuh | 4 +- .../device/dispatch/dispatch_spmv_orig.cuh | 8 +-- .../dispatch/dispatch_three_way_partition.cuh | 4 +- .../dispatch/dispatch_unique_by_key.cuh | 8 +-- cub/cub/grid/grid_queue.cuh | 4 +- cub/cub/thread/thread_load.cuh | 4 +- cub/cub/thread/thread_operators.cuh | 4 +- cub/cub/thread/thread_reduce.cuh | 8 +-- cub/cub/thread/thread_store.cuh | 4 +- cub/cub/util_allocator.cuh | 4 +- cub/cub/util_arch.cuh | 2 +- cub/cub/util_cpp_dialect.cuh | 4 +- cub/cub/util_debug.cuh | 4 +- cub/cub/util_device.cuh | 12 ++-- cub/cub/util_macro.cuh | 2 +- cub/cub/util_ptx.cuh | 12 ++-- cub/cub/util_temporary_storage.cuh | 4 +- cub/cub/util_type.cuh | 12 ++-- cub/cub/util_vsmem.cuh | 4 +- cub/cub/warp/warp_reduce.cuh | 8 +-- .../uninitialized_async_buffer.cuh | 4 +- .../__container/uninitialized_buffer.cuh | 4 +- .../cuda/experimental/__device/device.cuh | 2 +- .../device_memory_resource.cuh | 4 +- .../__stf/internal/data_interface.cuh | 4 +- .../__stf/internal/execution_policy.cuh | 6 +- .../__stf/internal/reduction_base.cuh | 4 +- .../places/exec/host/callback_queues.cuh | 4 +- .../experimental/__stf/places/inner_shape.cuh | 4 +- .../experimental/__stf/stream/reduction.cuh | 4 +- .../experimental/__stf/stream/stream_task.cuh | 4 +- .../cuda/experimental/__stf/utility/core.cuh | 8 +-- .../experimental/__stf/utility/unittest.cuh | 6 +- .../__utility/ensure_current_device.cuh | 4 +- docs/repo.toml | 9 +-- .../cuda/std/__type_traits/type_list.h | 4 +- thrust/thrust/detail/type_deduction.h | 4 +- thrust/thrust/device_malloc_allocator.h | 4 +- thrust/thrust/device_ptr.h | 4 +- thrust/thrust/device_reference.h | 2 +- thrust/thrust/memory.h | 4 +- thrust/thrust/optional.h | 4 +- thrust/thrust/pair.h | 18 +++--- .../random/linear_congruential_engine.h | 4 +- thrust/thrust/tuple.h | 18 +++--- 82 files changed, 376 insertions(+), 379 deletions(-) diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index 2fb15e9059b..fb88dfac07f 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -270,7 +270,7 @@ public: //! @name Head flag operations //! @{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * @param[out] head_flags @@ -349,7 +349,7 @@ public: Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sets head flags indicating discontinuities between items partitioned across the thread diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index a781d68e68b..bdc2a3dc932 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1217,7 +1217,7 @@ public: //! @} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// @param[in-out] items /// Items to exchange, converting between **striped** and **blocked** arrangements. 
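[Editor's note on the _CCCL_PUSH_MACROS change in patch 03 above] "interface" is one of the identifiers Windows headers can define as an object-like macro, so the guard now pushes and pops it alongside "min" and "max". A self-contained sketch of the underlying pragma pattern; the initial #define only simulates the definition picked up from Windows headers:

  #define interface struct // stand-in for the Windows-provided macro

  #pragma push_macro("interface")
  #undef interface
  // Inside the push/pop region the identifier is usable again.
  struct interface
  {
    virtual ~interface() = default;
  };
  #pragma pop_macro("interface")
  // Here "interface" expands to "struct" once more, as surrounding code may expect.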
@@ -1292,7 +1292,7 @@ public: ScatterToStriped(items, items, ranks, is_valid); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 641ff6d5d09..c1e9b95ac56 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -179,7 +179,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document //! @brief Internal implementation for load vectorization //! @@ -225,7 +225,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (& } } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Load a linear segment of items into a blocked arrangement across the thread block. diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 29510db5e97..b6d0c8a33b1 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -175,14 +175,14 @@ private: // Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Shared memory type required by this thread block union _TempStorage { KeyT keys_shared[ITEMS_PER_TILE + 1]; ValueT items_shared[ITEMS_PER_TILE + 1]; }; // union TempStorage -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// Shared storage reference _TempStorage& temp_storage; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 73228368fc5..5426e967712 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -93,7 +93,7 @@ struct BlockRadixRankEmptyCallback _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(int (&bins)[BINS_PER_THREAD]) {} }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -121,7 +121,7 @@ struct warp_in_block_matcher_t }; } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. 
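[Editor's note] The rename that runs through the rest of this patch swaps the ad-hoc DOXYGEN_SHOULD_SKIP_THIS guard for _CCCL_DOXYGEN_INVOKED, a macro the documentation build predefines so that implementation details are skipped while normal compilation is unaffected. A sketch of the pattern it protects; detail::helper_t and public_api are placeholders, not CUB names:

  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  // Implementation detail: compiled normally, but skipped when the
  // documentation build predefines _CCCL_DOXYGEN_INVOKED.
  namespace detail
  {
  struct helper_t
  {
    int value;
  };
  } // namespace detail
  #endif // _CCCL_DOXYGEN_INVOKED

  //! Public, documented entry point (this part always reaches Doxygen).
  inline int public_api(int x)
  {
    return x + 1;
  }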
@@ -263,7 +263,7 @@ private: /// BlockScan type using BlockScan = BlockScan; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document struct __align__(16) _TempStorage { union Aliasable @@ -276,7 +276,7 @@ private: // Storage for scanning local ranks typename BlockScan::TempStorage block_scan; }; -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED /// Shared storage reference _TempStorage& temp_storage; @@ -597,7 +597,7 @@ private: /// BlockScan type using BlockScanT = BlockScan; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document struct __align__(16) _TempStorage { typename BlockScanT::TempStorage block_scan; @@ -609,7 +609,7 @@ private: } aliasable; }; -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED /// Shared storage reference _TempStorage& temp_storage; @@ -1183,7 +1183,7 @@ struct BlockRadixRankMatchEarlyCounts } }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -1211,6 +1211,6 @@ using block_radix_rank_t = ::cuda::std::_If< BlockRadixRankMatchEarlyCounts>>>>; } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 48650992918..3223b920b13 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -303,7 +303,7 @@ private: /// BlockExchange utility type for values using BlockExchangeValues = BlockExchange; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Shared memory storage layout type union _TempStorage { @@ -312,7 +312,7 @@ private: typename BlockExchangeKeys::TempStorage exchange_keys; typename BlockExchangeValues::TempStorage exchange_values; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /****************************************************************************** * Thread fields @@ -469,7 +469,7 @@ private: } public: -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * @brief Sort blocked -> striped arrangement @@ -554,7 +554,7 @@ public: } } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// @smemstorage{BlockRadixSort} struct TempStorage : Uninitialized<_TempStorage> diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 253fdb8b1d9..74934576cd5 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -173,7 +173,7 @@ private: /// Type used to index into the block's runs using RunOffsetT = uint32_t; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Shared memory type required by this thread block union _TempStorage { @@ -184,7 +184,7 @@ private: DecodedOffsetT run_offsets[BLOCK_RUNS]; } runs; }; // union TempStorage -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// Internal storage allocator (used when the user does not provide pre-allocated shared memory) _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index 0644e8ca254..c49eb36a52e 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -1291,7 +1291,7 @@ public: } //! 
@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans //! @name Exclusive prefix scan operations (no initial value, single datum per thread) //! @{ @@ -1445,7 +1445,7 @@ public: } //! @} end member group -#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans +#endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans //! @name Inclusive prefix sum operations //! @{ diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 9d057d7fe4b..443f7a7f93b 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -1229,12 +1229,12 @@ public: //! @} end member group }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template > struct BlockStoreType { using type = cub::BlockStore; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index e56a0ec1e27..d4fdd9c405f 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -142,7 +142,7 @@ struct ShiftDigitExtractor : BaseDigitExtractor } }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -564,7 +564,7 @@ struct traits_t } // namespace radix } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! Twiddling keys for radix sort template diff --git a/cub/cub/detail/array_utils.cuh b/cub/cub/detail/array_utils.cuh index cfc8fafb452..1857c895a3c 100644 --- a/cub/cub/detail/array_utils.cuh +++ b/cub/cub/detail/array_utils.cuh @@ -51,7 +51,7 @@ CUB_NAMESPACE_BEGIN namespace detail { -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /*********************************************************************************************************************** * Generic Array-like to Array Conversion @@ -74,7 +74,7 @@ to_array(const Input& input) return to_array_impl(input, ::cuda::std::make_index_sequence()>{}); } -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED } // namespace detail diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index 35c52f4aedb..d83b2c1179a 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -49,7 +49,7 @@ # include #endif // !_CCCL_COMPILER(NVRTC) -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +#ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes: /** * \def CUB_DISABLE_CDP diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index 6a5dd8ff039..3bda5e596f3 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -37,10 +37,10 @@ # pragma system_header #endif // no system header -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +#ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes: //! 
When this macro is defined, no NVTX ranges are emitted by CCCL # define CCCL_DISABLE_NVTX -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED // Enable the functionality of this header if: // * The NVTX3 C API is available in CTK diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index e63bd3456c0..61693d808e2 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -49,7 +49,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -247,6 +247,6 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_acquire(unsigned int con } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index fe16cae9674..9b8091738db 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -302,6 +302,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned c } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 84add4262e2..41728342abc 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -267,7 +267,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, @@ -283,7 +283,7 @@ public: return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. @@ -398,7 +398,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, @@ -413,7 +413,7 @@ public: return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. @@ -545,7 +545,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( void* d_temp_storage, @@ -561,7 +561,7 @@ public: return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! 
@rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. @@ -665,7 +665,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( void* d_temp_storage, @@ -680,7 +680,7 @@ public: return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index e6abc4bd07b..32e485df2b3 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -206,7 +206,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -233,7 +233,7 @@ struct DeviceHistogram num_samples, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. @@ -386,7 +386,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -417,7 +417,7 @@ struct DeviceHistogram row_stride_bytes, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using @@ -588,7 +588,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1017,7 +1017,7 @@ struct DeviceHistogram return HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. @@ -1157,7 +1157,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1186,7 +1186,7 @@ struct DeviceHistogram row_stride_bytes, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples @@ -1346,7 +1346,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -263,7 +263,7 @@ public: return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. @@ -411,7 +411,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -586,7 +586,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -729,7 +729,7 @@ public: d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( void* d_temp_storage, @@ -746,7 +746,7 @@ public: return SortKeysCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. @@ -857,7 +857,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -874,7 +874,7 @@ public: return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. @@ -976,7 +976,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -992,7 +992,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 48666f1370b..621bf2b9070 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -223,7 +223,7 @@ struct DevicePartition stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! 
@rst //! Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into @@ -405,7 +405,7 @@ struct DevicePartition stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -818,7 +818,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -1252,7 +1252,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -1706,7 +1706,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2412,7 +2412,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -2552,7 +2552,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -2945,7 +2945,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -3345,7 +3345,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index a9b94f60534..bd78224be5d 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -205,7 +205,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), reduction_op, init, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef 
_CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, @@ -223,7 +223,7 @@ struct DeviceReduce return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide sum using the addition (``+``) operator. @@ -330,7 +330,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -345,7 +345,7 @@ struct DeviceReduce return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. @@ -456,7 +456,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -471,7 +471,7 @@ struct DeviceReduce return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. @@ -591,7 +591,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -606,7 +606,7 @@ struct DeviceReduce return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. @@ -715,7 +715,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -730,7 +730,7 @@ struct DeviceReduce return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) @@ -854,7 +854,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -869,7 +869,7 @@ struct DeviceReduce return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Fuses transform and reduce operations @@ -1195,7 +1195,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Enumerates the starting offsets and lengths of all non-trivial runs @@ -386,7 +386,7 @@ struct DeviceRunLengthEncode stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index e8b56709eda..e105fa36819 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -208,7 +208,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -224,7 +224,7 @@ struct DeviceScan return ExclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix sum in-place. @@ -302,7 +302,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -316,7 +316,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -450,7 +450,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -468,7 +468,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -579,7 +579,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -596,7 +596,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
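[Editor's note] All of the cub::Device* entry points touched by this patch share the same two-phase temporary-storage protocol, which is worth keeping in mind when reading the guarded overloads: call once with a null d_temp_storage to query the required size, allocate, then call again to run. A usage sketch against cub::DeviceReduce::Sum; sum_into is an illustrative wrapper, not CUB API:

  #include <cub/device/device_reduce.cuh>
  #include <cuda_runtime.h>

  cudaError_t sum_into(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
  {
    void* d_temp_storage      = nullptr;
    size_t temp_storage_bytes = 0;

    // Phase 1: d_temp_storage == nullptr, so only temp_storage_bytes is written.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

    cudaError_t status = cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream);
    if (status != cudaSuccess)
    {
      return status;
    }

    // Phase 2: the same call with real storage launches the reduction.
    status = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
    cudaFreeAsync(d_temp_storage, stream);
    return status;
  }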
Computes a device-wide exclusive prefix scan using the specified @@ -739,7 +739,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. @@ -880,7 +880,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group //! @name Inclusive scans @@ -1003,7 +1003,7 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -1019,7 +1019,7 @@ struct DeviceScan return InclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix sum in-place. @@ -1096,7 +1096,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -1110,7 +1110,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1333,7 +1333,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1350,7 +1350,7 @@ struct DeviceScan return InclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1451,7 +1451,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1466,7 +1466,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Computes a device-wide exclusive prefix sum-by-key with key equality @@ -1608,7 +1608,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan-by-key using the @@ -1814,7 +1814,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix scan-by-key using the @@ -2180,7 +2180,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index cc627b971ca..490caf36c48 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -265,7 +265,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -300,7 +300,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) @@ -476,7 +476,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -507,7 +507,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). @@ -683,7 +683,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -718,7 +718,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). @@ -898,7 +898,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -929,7 +929,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group //! 
@name Keys-only @@ -1092,7 +1092,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1123,7 +1123,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required). @@ -1291,7 +1291,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1320,7 +1320,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). @@ -1479,7 +1479,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1510,7 +1510,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). @@ -1676,7 +1676,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1705,7 +1705,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 9d4de803e86..7ad043eab5f 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -272,7 +272,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -444,7 +444,7 @@ public: return Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. @@ -572,7 +572,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -590,7 +590,7 @@ public: return Min( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Finds the first device-wide minimum in each segment using the @@ -742,7 +742,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -760,7 +760,7 @@ public: return ArgMin( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. @@ -877,7 +877,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -895,7 +895,7 @@ public: return Max( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Finds the first device-wide maximum in each segment using the @@ -1050,7 +1050,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -1068,7 +1068,7 @@ public: return ArgMax( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 1f219aebd25..10b5c6d2388 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -306,7 +306,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -333,7 +333,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -503,7 +503,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -530,7 +530,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -702,7 +702,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -720,7 +720,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -893,7 +893,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, 
num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -911,7 +911,7 @@ public: return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into ascending order. Approximately @@ -1049,7 +1049,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1076,7 +1076,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. @@ -1214,7 +1214,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1241,7 +1241,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into ascending order. @@ -1381,7 +1381,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1399,7 +1399,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. 
@@ -1538,7 +1538,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1556,7 +1556,7 @@ public: return StableSortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -1757,7 +1757,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -1788,7 +1788,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -1985,7 +1985,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2016,7 +2016,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -2213,7 +2213,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -2240,7 +2240,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -2436,7 +2436,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2463,7 +2463,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into ascending order. @@ -2623,7 +2623,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -2654,7 +2654,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. @@ -2814,7 +2814,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -2845,7 +2845,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into ascending order. 
@@ -3011,7 +3011,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -3038,7 +3038,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. @@ -3203,7 +3203,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -3230,7 +3230,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index b537ab9204b..27a18cf809a 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -203,7 +203,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -221,7 +221,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``. @@ -341,7 +341,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -358,7 +358,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. @@ -498,7 +498,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -516,7 +516,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. @@ -648,7 +648,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -665,7 +665,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Uses the ``select_op`` functor applied to ``d_flags`` to selectively copy the @@ -1011,7 +1011,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, @@ -1028,7 +1028,7 @@ struct DeviceSelect return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive @@ -1330,7 +1330,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, @@ -239,7 +239,7 @@ struct DeviceSpmv num_nonzeros, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 984109692f6..ef00248b448 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -66,7 +66,7 @@ struct DeviceTransform ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB // APIs. template @@ -88,7 +88,7 @@ struct DeviceTransform return Transform( ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding @@ -120,7 +120,7 @@ struct DeviceTransform stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB // APIs. template @@ -146,7 +146,7 @@ struct DeviceTransform ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Overview @@ -189,7 +189,7 @@ struct DeviceTransform ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, @@ -209,7 +209,7 @@ struct DeviceTransform return TransformStableArgumentAddresses( ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding @@ -241,7 +241,7 @@ struct DeviceTransform stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, @@ -265,7 +265,7 @@ struct DeviceTransform ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index af41c7137c7..4eef4fb5b86 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -169,7 +169,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy , stream(stream) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_DEPRECATED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( void* d_temp_storage, @@ -190,7 +190,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// Invocation template @@ -356,7 +356,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( void* d_temp_storage, @@ -372,7 +372,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index dab551559a4..15e0311fa2a 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1036,7 +1036,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1067,7 +1067,7 @@ public: stream, is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types @@ -1202,7 +1202,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1233,7 +1233,7 @@ public: stream, is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit @@ -1420,7 +1420,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, @@ -1453,7 +1453,7 @@ public: stream, 
is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types @@ -1592,7 +1592,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, @@ -1625,7 +1625,7 @@ public: stream, is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 23855d05951..c485e80e446 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -469,7 +469,7 @@ struct DispatchReduce : SelectedPolicy , launcher_factory(launcher_factory) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( void* d_temp_storage, @@ -494,7 +494,7 @@ struct DispatchReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //--------------------------------------------------------------------------- // Small-problem (single tile) invocation @@ -814,7 +814,7 @@ struct DispatchReduce : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -831,7 +831,7 @@ struct DispatchReduce : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; /** @@ -1008,7 +1008,7 @@ struct DispatchSegmentedReduce : SelectedPolicy , ptx_version(ptx_version) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( void* d_temp_storage, @@ -1037,7 +1037,7 @@ struct DispatchSegmentedReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //--------------------------------------------------------------------------- // Chained policy invocation @@ -1238,7 +1238,7 @@ struct DispatchSegmentedReduce : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1267,7 +1267,7 @@ struct DispatchSegmentedReduce : SelectedPolicy init, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 00d7280701a..482b9afe19f 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -550,7 +550,7 @@ struct DispatchReduceByKey return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef 
_CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -581,7 +581,7 @@ struct DispatchReduceByKey num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 2a6a0b3b641..bb99b20ab8a 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -543,7 +543,7 @@ struct DeviceRleDispatch return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -570,7 +570,7 @@ struct DeviceRleDispatch num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index d1efaa01cd2..691fc2ece8c 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -330,7 +330,7 @@ struct DispatchScan : SelectedPolicy , ptx_version(ptx_version) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, @@ -355,7 +355,7 @@ struct DispatchScan : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) @@ -593,7 +593,7 @@ struct DispatchScan : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -610,7 +610,7 @@ struct DispatchScan : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index aa04ce9f2ec..bf26c54e90e 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -342,7 +342,7 @@ struct DispatchScanByKey : SelectedPolicy , ptx_version(ptx_version) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( void* d_temp_storage, @@ -371,7 +371,7 @@ struct DispatchScanByKey : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) @@ -622,7 +622,7 @@ struct DispatchScanByKey : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED 
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -651,7 +651,7 @@ struct DispatchScanByKey : SelectedPolicy num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 80d8973c759..a98e1de494a 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -1131,7 +1131,7 @@ struct DispatchSegmentedSort : SelectedPolicy , stream(stream) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( void* d_temp_storage, @@ -1158,7 +1158,7 @@ struct DispatchSegmentedSort : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() @@ -1440,7 +1440,7 @@ struct DispatchSegmentedSort : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1469,7 +1469,7 @@ struct DispatchSegmentedSort : SelectedPolicy is_overwrite_okay, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetNumPasses(int radix_bits) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 807ba62e4b3..7fbf9ccda4f 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -845,7 +845,7 @@ struct DispatchSelectIf : SelectedPolicy return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -874,7 +874,7 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index a36a7f7890a..7d3d3094a48 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -893,7 +893,7 @@ struct DispatchSpmv return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template grid_queue, OffsetT n grid_queue.FillAndResetDrain(num_items); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 14577a56c92..6679f04b1e8 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -110,7 +110,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t Thread //@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Helper structure for templated load iteration (inductive case) /// \deprecated [Since 2.6.0] Use 
UnrolledThreadLoad() or UnrolledCopy() instead. @@ -378,6 +378,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t Thread return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 2de65083843..45d2446188f 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -396,7 +396,7 @@ CUB_DEPRECATED _CCCL_HOST_DEVICE BinaryFlip MakeBinaryFlip(BinaryOpT } _CCCL_SUPPRESS_DEPRECATED_POP -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace internal { @@ -720,6 +720,6 @@ using simd_type_t = typename CubOperatorToSimdOperator::simd_type; } // namespace internal -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 2a5b6566a26..d4b4a89fdfd 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -145,11 +145,11 @@ CUB_NAMESPACE_BEGIN //! template ()[0])>, #else typename ValueT = random_access_value_t, -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED typename AccumT = ::cuda::std::__accumulator_t> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op); // forward declaration @@ -158,7 +158,7 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& * Internal Reduction Implementations **********************************************************************************************************************/ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -697,6 +697,6 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, Reductio } // namespace internal -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index d0927a0d28d..a895884a60d 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -114,7 +114,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val); //@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Helper structure for templated store iteration (inductive case) template @@ -353,6 +353,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) ThreadStore(itr, val, Int2Type(), Int2Type::value>()); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index d9559b874f3..39e59bdf4de 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -110,7 +110,7 @@ struct CachingDeviceAllocator /// Invalid size static constexpr size_t INVALID_SIZE = (size_t) -1; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Invalid device ordinal static constexpr int INVALID_DEVICE_ORDINAL = -1; @@ -299,7 +299,7 @@ struct CachingDeviceAllocator /// Set of live device allocations currently in use BusyBlocks live_blocks; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED 
//--------------------------------------------------------------------- // Methods diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index 5f8780620fa..1d6d7289b78 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -52,7 +52,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // \deprecated [Since 2.1.0] # define CUB_USE_COOPERATIVE_GROUPS diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 006a070a7e9..6f54239bf84 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -42,7 +42,7 @@ #include // IWYU pragma: export -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // Deprecation warnings may be silenced by defining the following macros. These // may be combined. @@ -133,4 +133,4 @@ CUB_COMPILER_DEPRECATION_SOFT(C++ 17, C++ 14); # undef CUB_COMP_DEPR_IMPL0 # undef CUB_COMP_DEPR_IMPL1 -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index edb75a64da3..0a08c9ae223 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -48,7 +48,7 @@ #include -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +#ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes: /** * @def CUB_DEBUG_LOG @@ -92,7 +92,7 @@ */ # define CUB_DEBUG_ALL -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED // `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index e395b17f6d3..5b8c1f3f1f3 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -65,7 +65,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -90,7 +90,7 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel() {} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Returns the current device or -1 if an error occurred. @@ -105,13 +105,13 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice() return device; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document //! @brief RAII helper which saves the current device and switches to the specified device on construction and switches //! to the saved device on destruction. using SwitchDevice = ::cuda::__ensure_current_device; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Returns the number of CUDA devices available or -1 if an error @@ -171,7 +171,7 @@ CUB_RUNTIME_FUNCTION inline int DeviceCount() return result; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * \brief Per-device cache for a CUDA attribute value; the attribute is queried * and stored for each device upon construction. @@ -286,7 +286,7 @@ public: return entry.payload; } }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). 
diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index f98751b2ddf..b3ab7e73629 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -49,7 +49,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # define CUB_PREVENT_MACRO_SUBSTITUTION template constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 3fc73b90304..aa522d9576e 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -97,7 +97,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int return ret; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * Bitfield-extract. @@ -135,7 +135,7 @@ BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type } # endif -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p @@ -199,7 +199,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned return ret; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * Sync-threads barrier. @@ -329,7 +329,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) return d; } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Terminates the calling thread @@ -689,7 +689,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in return output; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -751,7 +751,7 @@ struct warp_matcher_t }; } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Compute a 32b mask of threads having the same least-significant diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index ee456083c3e..61c00f969f4 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -48,7 +48,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). @@ -112,6 +112,6 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t AliasTemporaries( return cudaSuccess; } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 42ffef0f6b0..f062ebc4ae9 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -85,7 +85,7 @@ CUB_NAMESPACE_BEGIN * Conditional types ******************************************************************************/ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { //! Alias to the given iterator's value_type. 
@@ -142,7 +142,7 @@ struct Log2 }; // Inductive case }; -# ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# ifndef _CCCL_DOXYGEN_INVOKED // Do not document template struct Log2 @@ -155,7 +155,7 @@ struct Log2 }; }; -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED /** * \brief Statically determine if N is a power-of-two @@ -169,13 +169,13 @@ struct PowerOfTwo }; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /****************************************************************************** * Marker types ******************************************************************************/ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * \brief A simple "null" marker type @@ -1156,6 +1156,6 @@ template struct Traits : NumericTraits::type> {}; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index d2e5541c09c..f5926ce11e5 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -248,6 +248,6 @@ using vsmem_helper_default_fallback_policy_t = } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 1d647a06c86..00440c18bdf 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -170,14 +170,14 @@ private: }; public: -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = ::cuda::std::_If, WarpReduceSmem>; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: /// Shared memory storage layout type for WarpReduce @@ -662,7 +662,7 @@ public: //! @} end member group }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template class WarpReduce { @@ -740,6 +740,6 @@ public: return input; } }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 4bcd93d259f..5bfd60da9d3 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -273,12 +273,12 @@ public: __stream_ = __new_stream; } -# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken +# ifndef _CCCL_DOXYGEN_INVOKED // friend functions are currently broken //! @brief Forwards the passed properties _CCCL_TEMPLATE(class _Property) _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>) _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {} -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED //! @brief Internal method to grow the allocation to a new size \p __count. //! @param __count The new size of the allocation. 
diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index d480ded4588..38c968d25c8 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -238,12 +238,12 @@ public: return __mr_; } -# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken +# ifndef _CCCL_DOXYGEN_INVOKED // friend functions are currently broken //! @brief Forwards the passed Properties _CCCL_TEMPLATE(class _Property) _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>) _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_buffer&, _Property) noexcept {} -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED //! @brief Internal method to grow the allocation to a new size \p __count. //! @param __count The new size of the allocation. diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 52c109bff6a..3e19bafb4e7 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -68,7 +68,7 @@ public: template <::cudaDeviceAttr _Attr> using attr_result_t = typename detail::__dev_attr<_Attr>::type; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # if defined(_CCCL_COMPILER_MSVC) // When __EDG__ is defined, std::construct_at will not permit constructing // a device object from an __emplace_device object. This is a workaround. diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh index bae301feb0b..7d54dd4f750 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh @@ -405,11 +405,11 @@ public: return __pool_; } -# ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen cannot handle the friend function +# ifndef _CCCL_DOXYGEN_INVOKED // Doxygen cannot handle the friend function //! @brief Enables the \c device_accessible property for \c device_memory_resource. //! @relates device_memory_resource friend constexpr void get_property(device_memory_resource const&, _CUDA_VMR::device_accessible) noexcept {} -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED }; static_assert(_CUDA_VMR::resource_with, ""); diff --git a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh index 0d2026fdbbe..7fe81211569 100644 --- a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh @@ -213,7 +213,7 @@ public: */ virtual size_t data_hash(instance_id_t instance_id) const = 0; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this /** * @brief Returns the size of the data represented by this logical data. * @@ -221,7 +221,7 @@ public: * purposes, or for the scheduling strategies. */ virtual size_t data_footprint() const = 0; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Get the part of the data interface that is common to all data instances. 
diff --git a/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh index d2cc954bfa9..e79dca54141 100644 --- a/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh @@ -301,7 +301,7 @@ public: * * @tparam level The level in the hierarchy to check for the `sync` property. Level starts from 0 (top-level). */ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this template static inline constexpr bool is_synchronizable = [] { if constexpr (level > 0) @@ -395,7 +395,7 @@ private: mem mem_bytes = mem(0); }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this /** * @brief Creates and returns a `thread_hierarchy_spec` object with no synchronization and dynamic width. * @@ -480,7 +480,7 @@ constexpr auto con(const P&... p) return R(p...); } /// @} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED #ifdef UNITTESTED_FILE diff --git a/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh b/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh index 0b6a0cd7c78..1d8d00d6670 100644 --- a/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh @@ -42,7 +42,7 @@ public: reduction_operator_base& operator=(const reduction_operator_base&) = delete; reduction_operator_base(const reduction_operator_base&) = delete; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails here +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails here // Reduction operator (inout, in) virtual void op_untyped( @@ -62,7 +62,7 @@ public: const exec_place& e, event_list& prereq_in) = 0; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // not used for now ... 
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh index 0b011cce0f5..2d3036ec143 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -30,7 +30,7 @@ #include #include -#ifndef DOXYGEN_SHOULD_SKIP_THIS // do not document +#ifndef _CCCL_DOXYGEN_INVOKED // do not document # if !defined(_CCCL_COMPILER_MSVC) # define STATEFUL_CALLBACKS @@ -603,4 +603,4 @@ inline bool cudaCallbackQueueProgress(callback_queue* q, bool flag) } // end namespace cuda::experimental::stf # endif // !_CCCL_COMPILER_MSVC -#endif // DOXYGEN_SHOULD_SKIP_THIS do not document +#endif // _CCCL_DOXYGEN_INVOKED do not document diff --git a/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh b/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh index 04b2badf7f2..383f43961be 100644 --- a/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh +++ b/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh @@ -31,7 +31,7 @@ namespace cuda::experimental::stf { -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this /** * @brief Applying "inner" on a mdspan shape returns an explicit shape which extents @@ -89,7 +89,7 @@ _CCCL_HOST_DEVICE box inner(const box& s) return box(inner_extents); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED #ifdef UNITTESTED_FILE UNITTEST("inner explicit shape (explicit bounds)") diff --git a/cudax/include/cuda/experimental/__stf/stream/reduction.cuh b/cudax/include/cuda/experimental/__stf/stream/reduction.cuh index 4493672c70b..deea02bbd9c 100644 --- a/cudax/include/cuda/experimental/__stf/stream/reduction.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/reduction.cuh @@ -65,7 +65,7 @@ public: const exec_place& e, cudaStream_t s) = 0; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen has issues with this code void op_untyped( logical_data_untyped& d, const data_place& inout_memory_node, @@ -110,7 +110,7 @@ public: prereqs = async_op.end(d.get_ctx()); } -#endif // DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +#endif // _CCCL_DOXYGEN_INVOKED // doxygen has issues with this code }; /** diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index 3c51f7304bb..48a28aa6648 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -617,7 +617,7 @@ private: template class deferred_stream_task; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen has issues with this code /* * Base of all deferred tasks. Stores the needed information for typed deferred tasks to run (see below). 
*/ @@ -877,6 +877,6 @@ public: }; } }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED } // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/core.cuh b/cudax/include/cuda/experimental/__stf/utility/core.cuh index e0eb417aad7..23b0ff5560f 100644 --- a/cudax/include/cuda/experimental/__stf/utility/core.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/core.cuh @@ -79,7 +79,7 @@ inline int setenv(const char* name, const char* value, int overwrite) } #endif -#ifndef DOXYGEN_SHOULD_SKIP_THIS // FIXME Doxygen is lost with decltype(auto) +#ifndef _CCCL_DOXYGEN_INVOKED // FIXME Doxygen is lost with decltype(auto) /** * @brief Custom move function that performs checks on the argument type. * @@ -97,7 +97,7 @@ _CCCL_HOST_DEVICE constexpr decltype(auto) mv(T&& obj) static_assert(!::std::is_const_v<::std::remove_reference_t>, "Misleading move from const lvalue."); return ::std::move(obj); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Creates a `std::shared_ptr` managing a copy of the given object. @@ -609,7 +609,7 @@ private: [[no_unique_address]] state_t payload = state_t(); }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // Operator implementations # define _3197bc91feaf98030b2cc0b441d7b0ea(op) \ template \ @@ -691,6 +691,6 @@ _3197bc91feaf98030b2cc0b441d7b0ea(>=); # undef _3197bc91feaf98030b2cc0b441d7b0ea -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED } // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/unittest.cuh b/cudax/include/cuda/experimental/__stf/utility/unittest.cuh index 3cc470df80e..dd42fbdd9bd 100644 --- a/cudax/include/cuda/experimental/__stf/utility/unittest.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/unittest.cuh @@ -31,7 +31,7 @@ #include -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // One level of macro indirection is required in order to resolve __COUNTER__, // and get varname1 instead of varname__COUNTER__. # define _55f56f4e3b45c8cf3fa50b28fed72e2a(a, b) _a56ec7069122ad2e0888a508ecdc4639(a, b) @@ -705,7 +705,7 @@ UNITTEST("cuda::std::source_location") test_func(); }; -#else // DOXYGEN_SHOULD_SKIP_THIS Do not document +#else // _CCCL_DOXYGEN_INVOKED Do not document // Ensure these are ignored by Doxygen # define UNITTEST(name, ...) 
-#endif // DOXYGEN_SHOULD_SKIP_THIS Do not document +#endif // _CCCL_DOXYGEN_INVOKED Do not document diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh index 6c37d4f6996..c644dd19a1c 100644 --- a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -27,7 +27,7 @@ #include #include -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace cuda::experimental { @@ -101,5 +101,5 @@ struct [[maybe_unused]] __ensure_current_device } }; } // namespace cuda::experimental -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED #endif // _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE diff --git a/docs/repo.toml b/docs/repo.toml index 9a684c4d5f4..f7c426f13db 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -159,11 +159,10 @@ doxygen_predefined = [ "_CCCL_DIAG_SUPPRESS_ICC(x)=", "_CCCL_DIAG_SUPPRESS_MSVC(x)=", "_CCCL_DIAG_SUPPRESS_NVHPC(x)=", + "_CCCL_DOXYGEN_INVOKED", "_CCCL_REQUIRES(x)= ::cuda::std::enable_if_t = 0>", "_CCCL_TEMPLATE(x)=template x _CCCL_EAT_REST", - "DOXYGEN_SHOULD_SKIP_THIS", - "DOXYGEN_ACTIVE", "__device__", "__host__", "__forceinline__", @@ -275,8 +274,7 @@ doxygen_predefined = [ "CUDASTF_HOST=", "CUDASTF_DEVICE=", "CUDASTF_HOST_DEVICE=", - "DOXYGEN_SHOULD_SKIP_THIS", - "DOXYGEN_ACTIVE", + "_CCCL_DOXYGEN_INVOKED", "_LIBCUDACXX_DEPRECATED_IN_CXX11", "THRUST_DISABLE_NAMESPACE_MAGIC", "THRUST_IGNORE_NAMESPACE_MAGIC_ERROR", @@ -445,8 +443,7 @@ doxygen_predefined = [ "_CUDAX_TRIVIAL_DEVICE_API", "_CUDAX_PUBLIC_API", "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE=", - "DOXYGEN_SHOULD_SKIP_THIS", - "DOXYGEN_ACTIVE", + "_CCCL_DOXYGEN_INVOKED", ] # make sure to use ./fetch_imgs.sh diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index bef58f29966..4bd928b0013 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -42,7 +42,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template struct __type_list; @@ -947,7 +947,7 @@ template using __type_iota = decltype(__detail::__type_iota_fn<_Ty, _Start, _Stride>(static_cast*>(nullptr))); -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED _LIBCUDACXX_END_NAMESPACE_STD diff --git a/thrust/thrust/detail/type_deduction.h b/thrust/thrust/detail/type_deduction.h index 08f31630bb5..a1d41de9676 100644 --- a/thrust/thrust/detail/type_deduction.h +++ b/thrust/thrust/detail/type_deduction.h @@ -59,7 +59,7 @@ /// // Trailing return types seem to confuse Doxygen, and cause it to interpret // parts of the function's body as new function signatures. -#if defined(THRUST_DOXYGEN) +#if defined(_CCCL_DOXYGEN_INVOKED) # define THRUST_DECLTYPE_RETURNS(...) \ { \ return (__VA_ARGS__); \ @@ -81,7 +81,7 @@ /// // Trailing return types seem to confuse Doxygen, and cause it to interpret // parts of the function's body as new function signatures. -#if defined(THRUST_DOXYGEN) +#if defined(_CCCL_DOXYGEN_INVOKED) # define THRUST_DECLTYPE_RETURNS(...) 
\ { \ return (__VA_ARGS__); \ diff --git a/thrust/thrust/device_malloc_allocator.h b/thrust/thrust/device_malloc_allocator.h index e5d2e04fc19..c9de52a8404 100644 --- a/thrust/thrust/device_malloc_allocator.h +++ b/thrust/thrust/device_malloc_allocator.h @@ -40,12 +40,12 @@ THRUST_NAMESPACE_BEGIN // forward declarations to WAR circular #includes -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template class device_ptr; template device_ptr device_malloc(const std::size_t n); -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! \addtogroup allocators Allocators * \ingroup memory_management diff --git a/thrust/thrust/device_ptr.h b/thrust/thrust/device_ptr.h index 5c5f55a3a83..058d12cb83f 100644 --- a/thrust/thrust/device_ptr.h +++ b/thrust/thrust/device_ptr.h @@ -154,14 +154,14 @@ class device_ptr return *this; } -#if THRUST_DOXYGEN +#ifdef _CCCL_DOXYGEN_INVOKED /*! \brief Return the raw pointer that this \c device_ptr points to. */ _CCCL_HOST_DEVICE T* get() const; #endif }; -#if THRUST_DOXYGEN +#ifdef _CCCL_DOXYGEN_INVOKED /*! Write the address that a \c device_ptr points to to an output stream. * * \param os The output stream. diff --git a/thrust/thrust/device_reference.h b/thrust/thrust/device_reference.h index 40a6790a5a1..545d5449bee 100644 --- a/thrust/thrust/device_reference.h +++ b/thrust/thrust/device_reference.h @@ -961,7 +961,7 @@ _CCCL_HOST_DEVICE void swap(device_reference& x, device_reference& y) // declare these methods for the purpose of Doxygenating them // they actually are defined for a base class -#if THRUST_DOXYGEN +#ifdef _CCCL_DOXYGEN_INVOKED /*! Writes to an output stream the value of a \p device_reference. * * \param os The output stream. diff --git a/thrust/thrust/memory.h b/thrust/thrust/memory.h index 6462545590b..290c99b7b2e 100644 --- a/thrust/thrust/memory.h +++ b/thrust/thrust/memory.h @@ -138,7 +138,7 @@ template _CCCL_HOST_DEVICE pointer malloc(const thrust::detail::execution_policy_base& system, std::size_t n); -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! This version of \p malloc allocates typed uninitialized storage associated with a given system. * diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h index 6762271cb47..bb9bf1cfb4b 100644 --- a/thrust/thrust/optional.h +++ b/thrust/thrust/optional.h @@ -1976,7 +1976,7 @@ optional(T) -> optional; #endif // Doxygen chokes on the trailing return types used below. -#if !defined(THRUST_DOXYGEN) +#if !defined(_CCCL_DOXYGEN_INVOKED) /// \exclude namespace detail { @@ -2034,7 +2034,7 @@ _CCCL_HOST_DEVICE auto optional_map_impl(Opt&& opt, F&& f) -> optional` acts similarly /// to a `T*`, but provides more operations and shows intent more clearly. diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h index e3c74677993..9f35a388bc7 100644 --- a/thrust/thrust/pair.h +++ b/thrust/thrust/pair.h @@ -49,12 +49,12 @@ THRUST_NAMESPACE_BEGIN * \tparam N This parameter selects the member of interest. * \tparam T A \c pair type of interest. */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_element = _CUDA_VSTD::tuple_element; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_element; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! 
This convenience metafunction is included for compatibility with * \p tuple. It returns \c 2, the number of elements of a \p pair, @@ -62,12 +62,12 @@ using _CUDA_VSTD::tuple_element; * * \tparam Pair A \c pair type of interest. */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_size = _CUDA_VSTD::tuple_size; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_size; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! \p pair is a generic data structure encapsulating a heterogeneous * pair of values. @@ -80,12 +80,12 @@ using _CUDA_VSTD::tuple_size; * requirements on the type of \p T2. T2's type is * provided by pair::second_type. */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using pair = _CUDA_VSTD::pair; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::pair; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED using _CUDA_VSTD::get; using _CUDA_VSTD::make_pair; diff --git a/thrust/thrust/random/linear_congruential_engine.h b/thrust/thrust/random/linear_congruential_engine.h index ce47c08b619..c289667749f 100644 --- a/thrust/thrust/random/linear_congruential_engine.h +++ b/thrust/thrust/random/linear_congruential_engine.h @@ -143,11 +143,11 @@ class linear_congruential_engine /*! The smallest value this \p linear_congruential_engine may potentially produce. */ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen breaks on the ternary :shrug: +#ifndef _CCCL_DOXYGEN_INVOKED // Doxygen breaks on the ternary :shrug: static const result_type min = c == 0u ? 1u : 0u; #else static const result_type min = 0u; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! The largest value this \p linear_congruential_engine may potentially produce. */ diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 1f8ed8943e5..d0d13670f0c 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -94,12 +94,12 @@ _CCCL_HOST_DEVICE inline bool operator>(const null_type&, const null_type&) * \see pair * \see tuple */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_element = _CUDA_VSTD::tuple_element; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_element; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! This metafunction returns the number of elements * of a \p tuple type of interest. @@ -109,12 +109,12 @@ using _CUDA_VSTD::tuple_element; * \see pair * \see tuple */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_size = _CUDA_VSTD::tuple_size; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_size; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! 
\brief \p tuple is a heterogeneous, fixed-size collection of values. * An instantiation of \p tuple with two arguments is similar to an @@ -153,12 +153,12 @@ using _CUDA_VSTD::tuple_size; * \see tuple_size * \see tie */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple = _CUDA_VSTD::tuple; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED using _CUDA_VSTD::get; using _CUDA_VSTD::make_tuple; From 667886ef0d7db34a412b06aba94bd0a9bf502bb9 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:21:33 +0100 Subject: [PATCH 05/45] implement C++26 `std::span::at` (#2924) Co-authored-by: Bernhard Manfred Gruber --- .../cuda/std/detail/libcxx/include/span | 19 ++ libcudacxx/include/cuda/std/version | 2 +- .../views/views.span/span.elem/at.pass.cpp | 225 ++++++++++++++++++ 3 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span index 8257ac93f1b..afe5ea34519 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/span +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span @@ -171,6 +171,7 @@ template #include #include #include // for ptrdiff_t +#include // standard-mandated includes #include @@ -502,6 +503,15 @@ public: return __data_[__idx]; } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(size_type __idx) const + { + if (__idx >= size()) + { + _CUDA_VSTD::__throw_out_of_range("span::at"); + } + return __data_[__idx]; + } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference front() const noexcept { _CCCL_ASSERT(!empty(), "span::front() on empty span"); @@ -731,6 +741,15 @@ public: return __data_[__idx]; } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(size_type __idx) const + { + if (__idx >= size()) + { + _CUDA_VSTD::__throw_out_of_range("span::at"); + } + return __data_[__idx]; + } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference front() const noexcept { _CCCL_ASSERT(!empty(), "span::front() on empty span"); diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 059bfcccc66..841aa449c77 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -60,7 +60,7 @@ // # define __cccl_lib_shared_timed_mutex 201402L # endif // !_LIBCUDACXX_HAS_NO_THREADS # define __cccl_lib_source_location 201907L -# define __cccl_lib_span 202002L +# define __cccl_lib_span 202311L // # define __cccl_lib_string_udls 201304L # define __cccl_lib_transformation_trait_aliases 201304L # define __cccl_lib_transparent_operators 201210L diff --git a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp new file mode 100644 index 00000000000..47f45804aad --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp @@ -0,0 +1,225 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++11 + +// + +// constexpr reference at(size_type idx) const; + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +#ifndef TEST_HAS_NO_EXCEPTIONS +# include +#endif // !TEST_HAS_NO_EXCEPTIONS + +template +__host__ __device__ constexpr void testSpanAt(SpanT&& anySpan, int index, int expectedValue) +{ + // non-const + { + auto elem = anySpan.at(index); + ASSERT_SAME_TYPE(ReferenceT, decltype(anySpan.at(index))); + assert(elem == expectedValue); + } + + // const + { + auto elem = cuda::std::as_const(anySpan).at(index); + ASSERT_SAME_TYPE(ReferenceT, decltype(cuda::std::as_const(anySpan).at(index))); + assert(elem == expectedValue); + } +} + +__host__ __device__ constexpr bool test() +{ + // With static extent + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084}; + cuda::std::span arrSpan{arr}; + + assert(cuda::std::dynamic_extent != arrSpan.extent); + + using ReferenceT = typename decltype(arrSpan)::reference; + + testSpanAt(arrSpan, 0, 0); + testSpanAt(arrSpan, 1, 1); + testSpanAt(arrSpan, 6, 9084); + } + + // With dynamic extent + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084}; + cuda::std::span dynSpan{arr}; + + assert(cuda::std::dynamic_extent == dynSpan.extent); + + using ReferenceT = typename decltype(dynSpan)::reference; + + testSpanAt(dynSpan, 0, 0); + testSpanAt(dynSpan, 1, 1); + testSpanAt(dynSpan, 6, 9084); + } + + return true; +} + +#ifndef TEST_HAS_NO_EXCEPTIONS +void test_exceptions() +{ + // With static extent + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084, cuda::std::numeric_limits::max()}; + const cuda::std::span arrSpan{arr}; + + try + { + using SizeT = typename decltype(arrSpan)::size_type; + cuda::std::ignore = arrSpan.at(cuda::std::numeric_limits::max()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = arrSpan.at(arr.size()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = arrSpan.at(arr.size() - 1); + // pass + assert(arrSpan.at(arr.size() - 1) == cuda::std::numeric_limits::max()); + } + catch (...) + { + assert(false); + } + } + + { + cuda::std::array arr{}; + const cuda::std::span arrSpan{arr}; + + try + { + cuda::std::ignore = arrSpan.at(0); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + } + + // With dynamic extent + + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084, cuda::std::numeric_limits::max()}; + const cuda::std::span dynSpan{arr}; + + try + { + using SizeT = typename decltype(dynSpan)::size_type; + cuda::std::ignore = dynSpan.at(cuda::std::numeric_limits::max()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = dynSpan.at(arr.size()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = dynSpan.at(arr.size() - 1); + assert(dynSpan.at(arr.size() - 1) == cuda::std::numeric_limits::max()); + } + catch (...) 
+ { + assert(false); + } + } + + { + cuda::std::array arr{}; + const cuda::std::span dynSpan{arr}; + + try + { + cuda::std::ignore = dynSpan.at(0); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + } +} +#endif // TEST_HAS_NO_EXCEPTIONS + +int main(int, char**) +{ + test(); + static_assert(test(), ""); + +#ifndef TEST_HAS_NO_EXCEPTIONS + NV_IF_TARGET(NV_IS_HOST, (test_exceptions();)) +#endif // TEST_HAS_NO_EXCEPTIONS + + return 0; +} From bc45573d680911f18aa8e8c0a970ef8b0742ab9c Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:51:57 +0100 Subject: [PATCH 06/45] move msvc compiler macros to new version (#2885) Co-authored-by: Michael Schellenberger Costa Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Eric Niebler --- cub/cub/detail/nvtx.cuh | 2 +- cub/cub/util_compiler.cuh | 8 +- cub/cub/util_cpp_dialect.cuh | 6 +- cub/cub/util_deprecated.cuh | 2 +- .../catch2_test_device_for_each_in_extents.cu | 6 +- cub/test/catch2_test_device_transform.cu | 4 +- cub/test/test_warning_suppression.cuh | 2 +- cudax/examples/stf/fdtd_mgpu.cu | 8 +- .../cuda/experimental/__async/lazy.cuh | 2 +- .../cuda/experimental/__async/meta.cuh | 2 +- .../cuda/experimental/__async/tuple.cuh | 2 +- .../cuda/experimental/__async/variant.cuh | 2 +- .../uninitialized_async_buffer.cuh | 5 +- .../__container/uninitialized_buffer.cuh | 5 +- .../cuda/experimental/__device/device.cuh | 2 +- .../__memory_resource/any_resource.cuh | 2 +- .../__memory_resource/device_memory_pool.cuh | 4 +- .../device_memory_resource.cuh | 4 +- .../__memory_resource/shared_resource.cuh | 2 +- .../experimental/__stf/graph/graph_task.cuh | 8 +- .../__stf/internal/backend_ctx.cuh | 8 +- .../places/exec/host/callback_queues.cuh | 4 +- .../experimental/__stf/stream/stream_ctx.cuh | 4 +- .../cuda/experimental/__stf/utility/core.cuh | 2 +- .../experimental/__stf/utility/traits.cuh | 12 +- cudax/test/stf/dot/basic.cu | 4 +- cudax/test/stf/dot/graph_print_to_dot.cu | 4 +- cudax/test/stf/dot/with_events.cu | 4 +- cudax/test/stf/error_checks/ctx_mismatch.cu | 6 +- .../error_checks/data_interface_mismatch.cu | 6 +- .../test/stf/error_checks/double_finalize.cu | 6 +- cudax/test/stf/error_checks/erase_frozen.cu | 6 +- .../error_checks/misformed_tasks_dbl_end.cu | 6 +- .../error_checks/misformed_tasks_dbl_start.cu | 6 +- .../test/stf/error_checks/non_managed_data.cu | 6 +- .../stf/error_checks/slice_check_bounds.cu | 6 +- .../stf/error_checks/uninitialized_data.cu | 6 +- .../stf/error_checks/unsatisfiable_spec.cu | 6 +- cudax/test/stf/error_checks/write_frozen.cu | 6 +- cudax/test/stf/parallel_for/fdtd.cu | 8 +- cudax/test/stf/reclaiming/graph.cu | 8 +- cudax/test/stf/stress/task_bench.cu | 8 +- cudax/test/stf/tools/auto_dump/auto_dump.cu | 4 +- .../device_memory_resource.h | 4 +- .../cuda/__memory_resource/get_property.h | 4 +- .../managed_memory_resource.h | 4 +- .../pinned_memory_resource.h | 4 +- .../cuda/__memory_resource/properties.h | 4 +- .../include/cuda/__memory_resource/resource.h | 4 +- .../cuda/__memory_resource/resource_ref.h | 4 +- .../std/__algorithm/iterator_operations.h | 4 +- .../std/__algorithm/ranges_iterator_concept.h | 4 +- .../include/cuda/std/__atomic/platform.h | 2 +- .../std/__atomic/platform/msvc_to_builtins.h | 4 +- libcudacxx/include/cuda/std/__bit/clz.h | 8 +- libcudacxx/include/cuda/std/__bit/ctz.h | 8 +- libcudacxx/include/cuda/std/__bit/popc.h | 8 +- libcudacxx/include/cuda/std/__cccl/assert.h 
| 8 +-
 .../include/cuda/std/__cccl/attributes.h | 10 +-
 libcudacxx/include/cuda/std/__cccl/builtin.h | 109 ++++++++----------
 libcudacxx/include/cuda/std/__cccl/compiler.h | 26 ++---
 .../include/cuda/std/__cccl/diagnostic.h | 26 ++---
 libcudacxx/include/cuda/std/__cccl/dialect.h | 6 +-
 .../include/cuda/std/__cccl/exceptions.h | 4 +-
 libcudacxx/include/cuda/std/__cccl/rtti.h | 4 +-
 .../include/cuda/std/__cccl/system_header.h | 9 +-
 .../include/cuda/std/__cccl/unreachable.h | 8 +-
 .../include/cuda/std/__cccl/visibility.h | 14 +-
 .../cuda/std/__concepts/concept_macros.h | 6 +-
 .../cuda/std/__concepts/convertible_to.h | 8 +-
 .../cuda/std/__concepts/destructible.h | 6 +-
 .../include/cuda/std/__concepts/swappable.h | 8 +-
 libcudacxx/include/cuda/std/__fwd/get.h | 4 +-
 libcudacxx/include/cuda/std/__fwd/subrange.h | 4 +-
 .../include/cuda/std/__iterator/concepts.h | 6 +-
 .../include/cuda/std/__iterator/distance.h | 4 +-
 .../cuda/std/__iterator/iterator_traits.h | 6 +-
 .../cuda/std/__iterator/move_iterator.h | 20 ++--
 libcudacxx/include/cuda/std/__iterator/next.h | 4 +-
 libcudacxx/include/cuda/std/__iterator/prev.h | 4 +-
 .../cuda/std/__iterator/reverse_iterator.h | 6 +-
 .../std/__iterator/unreachable_sentinel.h | 8 +-
 .../cuda/std/__memory/assume_aligned.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/access.h | 4 +-
 .../include/cuda/std/__ranges/concepts.h | 4 +-
 .../include/cuda/std/__ranges/dangling.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/data.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/empty.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/rbegin.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/rend.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/size.h | 4 +-
 .../include/cuda/std/__ranges/subrange.h | 4 +-
 .../include/cuda/std/__ranges/unwrap_end.h | 4 +-
 .../cuda/std/__ranges/view_interface.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/views.h | 4 +-
 .../std/__tuple_dir/structured_bindings.h | 4 +-
 .../include/cuda/std/__tuple_dir/tuple_like.h | 4 +-
 .../cuda/std/__type_traits/common_reference.h | 6 +-
 .../cuda/std/__type_traits/common_type.h | 6 +-
 .../cuda/std/__type_traits/disjunction.h | 4 +-
 .../cuda/std/__type_traits/is_convertible.h | 4 +-
 .../std/__type_traits/is_primary_template.h | 6 +-
 .../cuda/std/__type_traits/type_list.h | 6 +-
 .../include/cuda/std/__type_traits/type_set.h | 2 +-
 .../include/cuda/std/__utility/auto_cast.h | 2 +-
 .../include/cuda/std/__utility/declval.h | 3 +-
 libcudacxx/include/cuda/std/bitset | 6 +-
 .../cuda/std/detail/libcxx/include/__config | 24 ++--
 .../cuda/std/detail/libcxx/include/climits | 2 +-
 .../cuda/std/detail/libcxx/include/cmath | 18 +--
 .../cuda/std/detail/libcxx/include/limits | 2 +-
 .../cuda/std/detail/libcxx/include/span | 12 +-
 .../cuda/std/detail/libcxx/include/variant | 4 +-
 libcudacxx/include/cuda/std/inplace_vector | 52 ++++-----
 libcudacxx/include/cuda/std/version | 4 +-
 .../support.srcloc/general.pass.cpp | 8 +-
 .../bitset.members/to_ullong.pass.cpp | 2 +-
 .../bitset.members/to_ulong.pass.cpp | 2 +-
 libcudacxx/test/support/test_macros.h | 6 +-
 thrust/testing/async_sort.cu | 2 +-
 thrust/testing/cuda/transform.cu | 4 +-
 thrust/testing/functional.cu | 2 +-
 thrust/testing/set_difference.cu | 2 +-
 thrust/testing/set_intersection.cu | 2 +-
 thrust/testing/vector_manipulation.cu | 2 +-
 thrust/thrust/detail/config/compiler.h | 8 +-
 thrust/thrust/detail/config/compiler_fence.h | 4 +-
 thrust/thrust/detail/config/cpp_dialect.h | 6 +-
 thrust/thrust/detail/config/deprecated.h | 2 +-
 thrust/thrust/iterator/permutation_iterator.h | 8 +-
thrust/thrust/iterator/reverse_iterator.h | 6 +- thrust/thrust/iterator/transform_iterator.h | 8 +- thrust/thrust/iterator/zip_iterator.h | 6 +- thrust/thrust/optional.h | 4 +- thrust/thrust/system/detail/error_code.inl | 8 +- .../thrust/system/detail/error_condition.inl | 8 +- thrust/thrust/system/error_code.h | 16 +-- .../type_traits/is_contiguous_iterator.h | 2 +- 138 files changed, 458 insertions(+), 481 deletions(-) diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index 3bda5e596f3..35fae565b0b 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -96,7 +96,7 @@ CUB_NAMESPACE_END # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) CUB_DETAIL_NVTX_RANGE_SCOPE_IF(true, name) # else // NVTX3_CPP_DEFINITIONS_V1_0 -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # pragma message( \ "warning: nvtx3.hpp is available but does not define the V1 API. This is odd. Please open a GitHub issue at: https://github.com/NVIDIA/cccl/issues.") # else diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh index 6385e795045..2110268617c 100644 --- a/cub/cub/util_compiler.cuh +++ b/cub/cub/util_compiler.cuh @@ -66,13 +66,13 @@ #define CUB_DEVICE_COMPILER_CLANG 4 // figure out which host compiler we're using -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC //! deprecated [Since 2.7] -# define CUB_MSVC_VERSION _CCCL_MSVC_VERSION +# define CUB_MSVC_VERSION _MSC_VER //! deprecated [Since 2.7] -# define CUB_MSVC_VERSION_FULL _CCCL_MSVC_VERSION_FULL +# define CUB_MSVC_VERSION_FULL _MSC_FULL_VER #elif _CCCL_COMPILER(CLANG) //! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG @@ -89,7 +89,7 @@ #if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_CUDA_COMPILER_NVHPC) //! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC #elif _CCCL_COMPILER(GCC) diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 6f54239bf84..e2affdb3304 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -80,7 +80,7 @@ # define CUB_CPP_DIALECT _CCCL_STD_VER // Define CUB_COMPILER_DEPRECATION macro: -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define CUB_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(message(__FILE__ ":" _CCCL_TO_STRING(__LINE__) ": warning: " #msg)) # else // clang / gcc: # define CUB_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(GCC warning #msg) @@ -101,10 +101,10 @@ CUB_COMPILER_DEPRECATION(GCC 5.0); # elif _CCCL_COMPILER(CLANG, <, 7) CUB_COMPILER_DEPRECATION(Clang 7.0); -# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1910 +# elif _CCCL_COMPILER(MSVC, <, 19, 10) // <2017. Hard upgrade message: CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); -# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1920 +# elif _CCCL_COMPILER(MSVC2017) // >=2017, <2019. 
Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif diff --git a/cub/cub/util_deprecated.cuh b/cub/cub/util_deprecated.cuh index 250c3f53b16..c227d4309b9 100644 --- a/cub/cub/util_deprecated.cuh +++ b/cub/cub/util_deprecated.cuh @@ -55,7 +55,7 @@ #elif _CCCL_STD_VER >= 2014 # define CUB_DEPRECATED [[deprecated]] # define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define CUB_DEPRECATED __declspec(deprecated) # define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) #elif _CCCL_COMPILER(CLANG) diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 51f9b7e6d37..6f11810101c 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -26,9 +26,9 @@ ******************************************************************************/ #include -// TODO: remove _CCCL_COMPILER_MSVC check after MSVC bug related to vector comparison is fixed: +// TODO: remove _CCCL_COMPILER(MSVC) check after MSVC bug related to vector comparison is fixed: // "error C3546: '...': there are no parameter packs available to expand" -#if __cccl_lib_mdspan && !defined(_CCCL_COMPILER_MSVC) +#if __cccl_lib_mdspan && !_CCCL_COMPILER(MSVC) # include @@ -181,4 +181,4 @@ C2H_TEST("DeviceForEachInExtents 3D dynamic", "[ForEachInExtents][dynamic][devic REQUIRE(h_output == h_output_gpu); } -#endif // __cccl_lib_mdspan && !defined(_CCCL_COMPILER_MSVC) +#endif // __cccl_lib_mdspan && !_CCCL_COMPILER(MSVC) diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 4da07e330b6..db05da6c032 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -178,10 +178,10 @@ struct alignas(Alignment) overaligned_addable_t using overaligned_types = c2h::type_list -#ifndef _CCCL_COMPILER_MSVC // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned +#if !_CCCL_COMPILER(MSVC) // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned , overaligned_addable_t<256> -#endif // _CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) >; // test with types exceeding the memcpy_async and bulk copy alignments (16 and 128 bytes respectively) diff --git a/cub/test/test_warning_suppression.cuh b/cub/test/test_warning_suppression.cuh index e11d199e0a8..46c6080fed7 100644 --- a/cub/test/test_warning_suppression.cuh +++ b/cub/test/test_warning_suppression.cuh @@ -33,7 +33,7 @@ // C4127: conditional expression is constant // This can be fixed with `if constexpr` when available, but there's no way to // silence these pre-C++17. 
-#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # if _CCCL_STD_VER < 2017 # pragma warning(disable : 4127) # endif diff --git a/cudax/examples/stf/fdtd_mgpu.cu b/cudax/examples/stf/fdtd_mgpu.cu index a9a54d1f993..d991c97f258 100644 --- a/cudax/examples/stf/fdtd_mgpu.cu +++ b/cudax/examples/stf/fdtd_mgpu.cu @@ -21,7 +21,7 @@ using namespace cuda::experimental::stf; // FIXME : MSVC has trouble with box constructors -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) { FILE* f = fopen(filename.c_str(), "w"); @@ -99,11 +99,11 @@ _CCCL_DEVICE double Source(double t, double x, double y, double z) constexpr double k = 2 * pi / wavelength; return sin(k * x - omega * t); } -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) context ctx; // Initialize the time loop @@ -292,5 +292,5 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) }; ctx.finalize(); -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/include/cuda/experimental/__async/lazy.cuh b/cudax/include/cuda/experimental/__async/lazy.cuh index 95f7a4a0adb..7655b658401 100644 --- a/cudax/include/cuda/experimental/__async/lazy.cuh +++ b/cudax/include/cuda/experimental/__async/lazy.cuh @@ -136,7 +136,7 @@ struct __lazy_tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Ts...> : __detail::__la bool __engaged_[sizeof...(_Ts)] = {}; }; -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __mk_lazy_tuple_ { diff --git a/cudax/include/cuda/experimental/__async/meta.cuh b/cudax/include/cuda/experimental/__async/meta.cuh index dbe2d21a15a..4f2b00a9de2 100644 --- a/cudax/include/cuda/experimental/__async/meta.cuh +++ b/cudax/include/cuda/experimental/__async/meta.cuh @@ -135,7 +135,7 @@ inline constexpr bool __type_is_error<_ERROR<_What...>&> = true; // True if any of the types in _Ts... are errors; false otherwise. 
template inline constexpr bool __type_contains_error = -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) (__type_is_error<_Ts> || ...); #else __ustdex_unhandled_error(static_cast<_CUDA_VSTD::__type_list<_Ts...>*>(nullptr)); diff --git a/cudax/include/cuda/experimental/__async/tuple.cuh b/cudax/include/cuda/experimental/__async/tuple.cuh index 3891ec47df7..06e74e3aabc 100644 --- a/cudax/include/cuda/experimental/__async/tuple.cuh +++ b/cudax/include/cuda/experimental/__async/tuple.cuh @@ -82,7 +82,7 @@ template using __apply_result_t = decltype(__declval<_Tupl>().__apply(__declval<_Fn>(), __declval<_Tupl>(), __declval<_Us>()...)); -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __mk_tuple_ { diff --git a/cudax/include/cuda/experimental/__async/variant.cuh b/cudax/include/cuda/experimental/__async/variant.cuh index 3398cdc9717..2c8c5b1ea16 100644 --- a/cudax/include/cuda/experimental/__async/variant.cuh +++ b/cudax/include/cuda/experimental/__async/variant.cuh @@ -169,7 +169,7 @@ public: } }; -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __mk_variant_ { diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 5bfd60da9d3..fb502cbbf7d 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -34,8 +34,7 @@ #include -#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ - && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) //! @file //! The \c uninitialized_async_buffer class provides a typed buffer allocated in stream-order from a given memory @@ -299,6 +298,6 @@ using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_ } // namespace cuda::experimental -#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //__CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 38c968d25c8..9a2f1200678 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -33,8 +33,7 @@ #include -#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ - && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) //! @file //! The \c uninitialized_buffer class provides a typed buffer allocated from a given memory resource. 
@@ -264,6 +263,6 @@ using uninitialized_device_buffer = uninitialized_buffer<_Tp, _CUDA_VMR::device_ } // namespace cuda::experimental -#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //__CUDAX__CONTAINERS_UNINITIALIZED_BUFFER_H diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 3e19bafb4e7..98db56f668e 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -69,7 +69,7 @@ public: using attr_result_t = typename detail::__dev_attr<_Attr>::type; #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) // When __EDG__ is defined, std::construct_at will not permit constructing // a device object from an __emplace_device object. This is a workaround. device(detail::__emplace_device __ed) diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index f386853bb08..f442e56dcfe 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -28,7 +28,7 @@ #endif // cuda::mr is unavable on MSVC 2017 -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) # error "The any_resource header is not supported on MSVC 2017" #endif diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh index 4708930d8ad..c74f7d68f77 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh @@ -22,7 +22,7 @@ #endif // no system header // cudaMallocAsync was introduced in CTK 11.2 -#if !defined(_CCCL_COMPILER_MSVC_2017) && _CCCL_CUDACC_AT_LEAST(11, 2) +#if !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -428,6 +428,6 @@ public: # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && _CCCL_CUDACC_AT_LEAST(11, 2) +#endif // !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) #endif // _CUDAX__MEMORY_RESOURCE_DEVICE_MEMORY_POOL diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh index 7d54dd4f750..fffe3dea722 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh @@ -22,7 +22,7 @@ #endif // no system header // cudaMallocAsync was introduced in CTK 11.2 -#if !defined(_CCCL_COMPILER_MSVC_2017) && _CCCL_CUDACC_AT_LEAST(11, 2) +#if !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -417,6 +417,6 @@ static_assert(_CUDA_VMR::resource_with= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && _CCCL_CUDACC_AT_LEAST(11, 2) +#endif // !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) #endif //_CUDAX__MEMORY_RESOURCE_CUDA_DEVICE_MEMORY_RESOURCE diff --git a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh 
b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh index e92538ae8a0..1b0a81320b1 100644 --- a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh @@ -28,7 +28,7 @@ #endif // cuda::mr is unavable on MSVC 2017 -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) # error "The shared_resource header is not supported on MSVC 2017" #endif diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 884abc7cdac..f10c883e2ee 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -420,11 +420,11 @@ public: return mv(*this); } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) // TODO (miscco): figure out why MSVC is complaining about unreachable code here _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) template void operator->*(Fun&& f) @@ -518,9 +518,9 @@ public: ::std::apply(f, tuple_prepend(mv(childGraph), typed_deps())); } } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) private: auto typed_deps() diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index ce162fc40c0..2822370c1f3 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -492,19 +492,19 @@ protected: return nullptr; } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) virtual event_list stream_to_event_list(cudaStream_t, ::std::string) const { fprintf(stderr, "Internal error.\n"); abort(); return event_list(); } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) virtual size_t epoch() const { diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh index 2d3036ec143..387a3594c1f 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -32,7 +32,7 @@ #ifndef _CCCL_DOXYGEN_INVOKED // do not document -# if !defined(_CCCL_COMPILER_MSVC) +# if !_CCCL_COMPILER(MSVC) # define STATEFUL_CALLBACKS namespace cuda::experimental::stf @@ -602,5 +602,5 @@ inline bool cudaCallbackQueueProgress(callback_queue* q, bool flag) } // end namespace cuda::experimental::stf -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #endif // _CCCL_DOXYGEN_INVOKED do not document diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh index 022179341d9..86cceb1b2d9 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -809,7 +809,7 @@ UNITTEST("movable stream_task") // FIXME : This test is causing some compiler errors with MSVC, so we disable // it on MSVC for now -# if !defined(_CCCL_COMPILER_MSVC) 
+# if !_CCCL_COMPILER(MSVC) UNITTEST("logical_data_untyped moveable") { using namespace cuda::experimental::stf; @@ -852,7 +852,7 @@ UNITTEST("logical_data_untyped moveable") ctx.finalize(); }; -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) # ifdef __CUDACC__ namespace reserved diff --git a/cudax/include/cuda/experimental/__stf/utility/core.cuh b/cudax/include/cuda/experimental/__stf/utility/core.cuh index 23b0ff5560f..42e68c36905 100644 --- a/cudax/include/cuda/experimental/__stf/utility/core.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/core.cuh @@ -39,7 +39,7 @@ namespace cuda::experimental::stf { // Hack setenv on Windows -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) /** * @brief Sets an environment variable, mimicking the behavior of `std::setenv` on Windows. * diff --git a/cudax/include/cuda/experimental/__stf/utility/traits.cuh b/cudax/include/cuda/experimental/__stf/utility/traits.cuh index 402737a44d3..8308e56d702 100644 --- a/cudax/include/cuda/experimental/__stf/utility/traits.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/traits.cuh @@ -44,11 +44,11 @@ namespace reserved template constexpr ::std::string_view type_name_IMPL() { -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) return __FUNCSIG__; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv return __PRETTY_FUNCTION__; -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } // Length of prefix and suffix in __PRETTY_FUNCTION__ when used with `type_name`. @@ -73,14 +73,14 @@ inline constexpr ::std::pair type_name_affixes = [] { template constexpr ::std::string_view type_name_impl() { -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) constexpr ::std::string_view p = __FUNCSIG__; // MSVC does not provide constexpr methods so we make this utility much simpler and return __FUNCSIG__ directly return p; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv ::std::string_view p = __PRETTY_FUNCTION__; return p.substr(type_name_affixes.first, p.size() - type_name_affixes.first - type_name_affixes.second); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } } // namespace reserved diff --git a/cudax/test/stf/dot/basic.cu b/cudax/test/stf/dot/basic.cu index dce79545a71..b602f142a2f 100644 --- a/cudax/test/stf/dot/basic.cu +++ b/cudax/test/stf/dot/basic.cu @@ -20,7 +20,7 @@ using namespace cuda::experimental::stf; int main() { // TODO (miscco): Make it work for windows -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random filename int r = rand(); @@ -44,5 +44,5 @@ int main() EXPECT(access(filename, F_OK) != -1); EXPECT(unlink(filename) == 0); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/dot/graph_print_to_dot.cu b/cudax/test/stf/dot/graph_print_to_dot.cu index 62e5abfd193..cd024f72770 100644 --- a/cudax/test/stf/dot/graph_print_to_dot.cu +++ b/cudax/test/stf/dot/graph_print_to_dot.cu @@ -22,7 +22,7 @@ __global__ void dummy() {} int main() { // TODO (miscco): Make it work for windows -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random filename int r = rand(); @@ -46,5 +46,5 @@ int main() EXPECT(access(filename, F_OK) != -1); EXPECT(unlink(filename) == 0); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git 
a/cudax/test/stf/dot/with_events.cu b/cudax/test/stf/dot/with_events.cu index a03425024b8..b3c636e1d69 100644 --- a/cudax/test/stf/dot/with_events.cu +++ b/cudax/test/stf/dot/with_events.cu @@ -20,7 +20,7 @@ using namespace cuda::experimental::stf; int main() { // TODO (miscco): Make it work for windows -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random filename int r = rand(); @@ -45,5 +45,5 @@ int main() EXPECT(access(filename, F_OK) != -1); EXPECT(unlink(filename) == 0); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/error_checks/ctx_mismatch.cu b/cudax/test/stf/error_checks/ctx_mismatch.cu index cafa6873dec..c04d589c367 100644 --- a/cudax/test/stf/error_checks/ctx_mismatch.cu +++ b/cudax/test/stf/error_checks/ctx_mismatch.cu @@ -53,9 +53,9 @@ void run(double (&X)[n]) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -66,7 +66,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) const int n = 12; double X[n]; diff --git a/cudax/test/stf/error_checks/data_interface_mismatch.cu b/cudax/test/stf/error_checks/data_interface_mismatch.cu index 79969f390ba..ea2ada7e633 100644 --- a/cudax/test/stf/error_checks/data_interface_mismatch.cu +++ b/cudax/test/stf/error_checks/data_interface_mismatch.cu @@ -63,9 +63,9 @@ void run(double (&X)[n]) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -76,7 +76,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) const int n = 12; double X[n]; diff --git a/cudax/test/stf/error_checks/double_finalize.cu b/cudax/test/stf/error_checks/double_finalize.cu index 6de61c0c2b3..37913ca6e36 100644 --- a/cudax/test/stf/error_checks/double_finalize.cu +++ b/cudax/test/stf/error_checks/double_finalize.cu @@ -39,9 +39,9 @@ int main() // This test only works when assert() is enabled in #ifndef NDEBUG /* Setup an handler to catch the SIGABRT signal during the programming error */ -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) context ctx; diff --git a/cudax/test/stf/error_checks/erase_frozen.cu b/cudax/test/stf/error_checks/erase_frozen.cu index 3e99c360aa2..624dfb062f8 100644 --- a/cudax/test/stf/error_checks/erase_frozen.cu +++ b/cudax/test/stf/error_checks/erase_frozen.cu @@ -40,9 +40,9 @@ void cleanupRoutine(int 
/*unused*/) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -53,7 +53,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; const int N = 16; diff --git a/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu b/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu index b91a8d0aabb..fa28e5467e0 100644 --- a/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu +++ b/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu @@ -39,9 +39,9 @@ int main() // This test only works when assert() is enabled in #ifndef NDEBUG /* Setup an handler to catch the SIGABRT signal during the programming error */ -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu b/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu index 3f783773b0e..b35cb99457f 100644 --- a/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu +++ b/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu @@ -37,9 +37,9 @@ void cleanupRoutine(int /*unused*/) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -50,7 +50,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/non_managed_data.cu b/cudax/test/stf/error_checks/non_managed_data.cu index 387322a0912..a1188c7750f 100644 --- a/cudax/test/stf/error_checks/non_managed_data.cu +++ b/cudax/test/stf/error_checks/non_managed_data.cu @@ -41,9 +41,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -54,7 +54,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/slice_check_bounds.cu b/cudax/test/stf/error_checks/slice_check_bounds.cu index f27cebdd722..fecea9e7a55 100644 --- a/cudax/test/stf/error_checks/slice_check_bounds.cu +++ 
b/cudax/test/stf/error_checks/slice_check_bounds.cu @@ -48,9 +48,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -61,7 +61,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) context ctx; diff --git a/cudax/test/stf/error_checks/uninitialized_data.cu b/cudax/test/stf/error_checks/uninitialized_data.cu index efd45db4d1b..6af57556ad5 100644 --- a/cudax/test/stf/error_checks/uninitialized_data.cu +++ b/cudax/test/stf/error_checks/uninitialized_data.cu @@ -39,9 +39,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/unsatisfiable_spec.cu b/cudax/test/stf/error_checks/unsatisfiable_spec.cu index ee3c10ad9cf..a0e4277979c 100644 --- a/cudax/test/stf/error_checks/unsatisfiable_spec.cu +++ b/cudax/test/stf/error_checks/unsatisfiable_spec.cu @@ -39,9 +39,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) context ctx; diff --git a/cudax/test/stf/error_checks/write_frozen.cu b/cudax/test/stf/error_checks/write_frozen.cu index 1d46c702c0f..b4e08642a5e 100644 --- a/cudax/test/stf/error_checks/write_frozen.cu +++ b/cudax/test/stf/error_checks/write_frozen.cu @@ -40,9 +40,9 @@ void cleanupRoutine(int /*unused*/) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -53,7 +53,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; const int N = 16; diff --git a/cudax/test/stf/parallel_for/fdtd.cu b/cudax/test/stf/parallel_for/fdtd.cu index 0e97d2e7afd..686d613d710 100644 --- a/cudax/test/stf/parallel_for/fdtd.cu +++ b/cudax/test/stf/parallel_for/fdtd.cu @@ -14,7 +14,7 @@ using namespace cuda::experimental::stf; // FIXME 
: MSVC has trouble with box constructors -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) { FILE* f = fopen(filename.c_str(), "w"); @@ -92,11 +92,11 @@ __device__ double Source(double t, double x, double y, double z) constexpr double k = 2 * pi / wavelength; return sin(k * x - omega * t); } -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) stream_ctx ctx; // Domain dimensions @@ -250,5 +250,5 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) } ctx.finalize(); -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/reclaiming/graph.cu b/cudax/test/stf/reclaiming/graph.cu index 3c77cb15980..56e82254bff 100644 --- a/cudax/test/stf/reclaiming/graph.cu +++ b/cudax/test/stf/reclaiming/graph.cu @@ -10,19 +10,19 @@ #include -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) using namespace cuda::experimental::stf; __global__ void kernel() { // No-op } -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { // TODO fix setenv -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) int nblocks = 4; size_t block_size = 1024 * 1024; @@ -68,5 +68,5 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) } ctx.finalize(); -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/stress/task_bench.cu b/cudax/test/stf/stress/task_bench.cu index 769b057075b..5d5d483c9ca 100644 --- a/cudax/test/stf/stress/task_bench.cu +++ b/cudax/test/stf/stress/task_bench.cu @@ -57,10 +57,10 @@ int log2Int(int n) return result; } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) bool skip_task(test_id id, int t, int i, int /*W*/) { switch (id) @@ -89,9 +89,9 @@ bool skip_task(test_id id, int t, int i, int /*W*/) abort(); return true; } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) std::vector input_deps(test_id id, int t, int i, int W) { diff --git a/cudax/test/stf/tools/auto_dump/auto_dump.cu b/cudax/test/stf/tools/auto_dump/auto_dump.cu index 835016fcf2a..dd9b06981d4 100644 --- a/cudax/test/stf/tools/auto_dump/auto_dump.cu +++ b/cudax/test/stf/tools/auto_dump/auto_dump.cu @@ -21,7 +21,7 @@ using namespace cuda::experimental::stf; int main() { -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random dirname srand(static_cast(time(nullptr))); int r = rand(); @@ -60,5 +60,5 @@ int main() EXPECT(!std::filesystem::exists(dirname + "/" + std::to_string(2))); std::filesystem::remove_all(dirname); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h index d82ba355ff4..72e01a5521d 100644 --- a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if 
!defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -214,6 +214,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif // _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/get_property.h b/libcudacxx/include/cuda/__memory_resource/get_property.h index fcfa023e6fa..d9b762225eb 100644 --- a/libcudacxx/include/cuda/__memory_resource/get_property.h +++ b/libcudacxx/include/cuda/__memory_resource/get_property.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # include # include @@ -180,6 +180,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_GET_PROPERTY_H diff --git a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h index c1af2074beb..86835aede18 100644 --- a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -196,6 +196,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h index 2fe29653d75..819d485a104 100644 --- a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -199,6 +199,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/properties.h b/libcudacxx/include/cuda/__memory_resource/properties.h index 42fbbda5f7b..6b0279eb06f 100644 --- 
a/libcudacxx/include/cuda/__memory_resource/properties.h +++ b/libcudacxx/include/cuda/__memory_resource/properties.h @@ -24,7 +24,7 @@ #include #include -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if _CCCL_STD_VER >= 2014 @@ -68,6 +68,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_PROPERTIES_H diff --git a/libcudacxx/include/cuda/__memory_resource/resource.h b/libcudacxx/include/cuda/__memory_resource/resource.h index bfcf6d73174..0b864e649d6 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource.h +++ b/libcudacxx/include/cuda/__memory_resource/resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # include # include @@ -129,6 +129,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/resource_ref.h b/libcudacxx/include/cuda/__memory_resource/resource_ref.h index 164625c6493..81831720349 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource_ref.h +++ b/libcudacxx/include/cuda/__memory_resource/resource_ref.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # include # include @@ -640,6 +640,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_RESOURCE_REF_H diff --git a/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h b/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h index 7eb7a715962..be02de72b97 100644 --- a/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h +++ b/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h @@ -44,7 +44,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _IterOps; -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) struct _RangeAlgPolicy {}; @@ -69,7 +69,7 @@ struct _IterOps<_RangeAlgPolicy> static constexpr auto __advance_to = _CUDA_VRANGES::advance; }; -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) struct _ClassicAlgPolicy {}; diff --git a/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h b/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h index 7562eedbf95..43e674b976f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h +++ b/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h @@ -24,7 
+24,7 @@ #include #include -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -60,6 +60,6 @@ using __iterator_concept = decltype(__get_iterator_concept<_Iter>()); _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ALGORITHM_RANGES_ITERATOR_CONCEPT_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform.h b/libcudacxx/include/cuda/std/__atomic/platform.h index 6367e20234e..66eae16ffaa 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform.h +++ b/libcudacxx/include/cuda/std/__atomic/platform.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include #endif diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index eb5721c8022..8c532260284 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include # include @@ -637,6 +637,6 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) _LIBCUDACXX_END_NAMESPACE_STD -#endif // defined(_CCCL_COMPILER_MSVC) +#endif // _CCCL_COMPILER(MSVC) #endif // __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h index c61508f8287..267f022737a 100644 --- a/libcudacxx/include/cuda/std/__bit/clz.h +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -24,9 +24,9 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -55,7 +55,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz64(uint64_t __x) return __binary_clz32(__x >> 32 * !!(__x & 0xFFFFFFFF00000000), 32 * !(__x & 0xFFFFFFFF00000000)); } -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_clz(uint32_t __x) noexcept { @@ -97,7 +97,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) noexcept return __constexpr_clz(__x); } -#else // defined(_CCCL_COMPILER_MSVC) +#else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h index 0f08f67d38b..9d2e771bd61 100644 --- a/libcudacxx/include/cuda/std/__bit/ctz.h +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -24,9 +24,9 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -55,7 +55,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_ctz64(uint64_t __x) noexcept return __binary_ctz32(__x >> 32 * !(__x & 0x00000000FFFFFFFF), 32 * !(__x & 0x00000000FFFFFFFF)); } -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_ctz(uint32_t __x) noexcept { @@ -99,7 +99,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) noexcept return __constexpr_ctz(__x); } -#else 
// defined(_CCCL_COMPILER_MSVC) +#else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) diff --git a/libcudacxx/include/cuda/std/__bit/popc.h b/libcudacxx/include/cuda/std/__bit/popc.h index d3ec52342ad..dc22999b985 100644 --- a/libcudacxx/include/cuda/std/__bit/popc.h +++ b/libcudacxx/include/cuda/std/__bit/popc.h @@ -24,7 +24,7 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include # if defined(_M_ARM64) @@ -35,7 +35,7 @@ # define _LIBCUDACXX_MSVC_POPC64(x) __popcnt64(x) # endif // !_M_ARM64 -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -56,7 +56,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __fallback_popc64(uint64_t __x) return __fallback_popc32(__x - ((__x >> 1) & 0x5555555555555555)); } -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_popcount(uint32_t __x) noexcept { @@ -98,7 +98,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) noexcept return __constexpr_popcount(static_cast(__x)); } -#else // defined(_CCCL_COMPILER_MSVC) +#else // _CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) { diff --git a/libcudacxx/include/cuda/std/__cccl/assert.h b/libcudacxx/include/cuda/std/__cccl/assert.h index 70b2398cccf..8c48296619c 100644 --- a/libcudacxx/include/cuda/std/__cccl/assert.h +++ b/libcudacxx/include/cuda/std/__cccl/assert.h @@ -64,7 +64,7 @@ //! _CCCL_ASSERT_IMPL_HOST should never be used directly #if _CCCL_COMPILER(NVRTC) // There is no host standard library in nvrtc # define _CCCL_ASSERT_IMPL_HOST(expression, message) ((void) 0) -#elif _CCCL_HAS_INCLUDE() && defined(_CCCL_COMPILER_MSVC) // MSVC uses _STL_VERIFY from +#elif _CCCL_HAS_INCLUDE() && _CCCL_COMPILER(MSVC) // MSVC uses _STL_VERIFY from # include # define _CCCL_ASSERT_IMPL_HOST(expression, message) _STL_VERIFY(expression, message) #else // ^^^ MSVC STL ^^^ / vvv !MSVC STL vvv @@ -97,15 +97,15 @@ _CCCL_HOST_DEVICE _CCCL_BUILTIN_EXPECT(static_cast(expression), 1) \ ? (void) 0 : __assertfail(message, __FILE__, __LINE__, __func__, sizeof(char)) #elif defined(_CCCL_CUDA_COMPILER_NVCC) //! Use __assert_fail to implement device side asserts -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define _CCCL_ASSERT_IMPL_DEVICE(expression, message) \ _CCCL_BUILTIN_EXPECT(static_cast(expression), 1) \ ? (void) 0 : _wassert(_CRT_WIDE(#message), __FILEW__, __LINE__) -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_ASSERT_IMPL_DEVICE(expression, message) \ _CCCL_BUILTIN_EXPECT(static_cast(expression), 1) \ ? 
(void) 0 : __assert_fail(message, __FILE__, __LINE__, __func__) -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #elif defined(_CCCL_CUDA_COMPILER) # define _CCCL_ASSERT_IMPL_DEVICE(expression, message) _CCCL_ASSERT_IMPL_HOST(expression, message) #else // ^^^ _CCCL_CUDA_COMPILER ^^^ / vvv !_CCCL_CUDA_COMPILER vvv diff --git a/libcudacxx/include/cuda/std/__cccl/attributes.h b/libcudacxx/include/cuda/std/__cccl/attributes.h index 7a8bcb49f0b..0ed5fdd2b7f 100644 --- a/libcudacxx/include/cuda/std/__cccl/attributes.h +++ b/libcudacxx/include/cuda/std/__cccl/attributes.h @@ -80,7 +80,7 @@ # define _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS #endif // !_CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS && _CCCL_COMPILER(CLANG) -#if _CCCL_HAS_CPP_ATTRIBUTE(nodiscard) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_STD_VER >= 2017) +#if _CCCL_HAS_CPP_ATTRIBUTE(nodiscard) || (_CCCL_COMPILER(MSVC) && _CCCL_STD_VER >= 2017) # define _CCCL_NODISCARD [[nodiscard]] #else // ^^^ has nodiscard ^^^ / vvv no nodiscard vvv # define _CCCL_NODISCARD @@ -101,7 +101,7 @@ # define _CCCL_ALIAS_ATTRIBUTE(...) __VA_ARGS__ #endif // _CCCL_CUDACC_AT_LEAST(11, 3) -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_NORETURN __declspec(noreturn) #elif _CCCL_HAS_CPP_ATTRIBUTE(noreturn) # define _CCCL_NORETURN [[noreturn]] @@ -109,10 +109,10 @@ # define _CCCL_NORETURN __attribute__((noreturn)) #endif -#if defined(_CCCL_COMPILER_MSVC) // vvv _CCCL_COMPILER_MSVC vvv +#if _CCCL_COMPILER(MSVC) // vvv _CCCL_COMPILER(MSVC) vvv # define _CCCL_RESTRICT __restrict -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_RESTRICT __restrict__ -#endif // ^^^ !_CCCL_COMPILER_MSVC ^^^ +#endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ #endif // __CCCL_ATTRIBUTES_H diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index ac93b0f8caf..b3a53918054 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -86,8 +86,7 @@ # define _CCCL_BUILTIN_ARRAY_EXTENT(...) __array_extent(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__array_extent) -#if _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1923) \ - || _CCCL_COMPILER(GCC) +#if _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || _CCCL_COMPILER(MSVC, >=, 19, 23) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned) @@ -97,7 +96,7 @@ #endif // _CCCL_CUDACC_BELOW(11, 2) // nvhpc has a bug where it supports __builtin_addressof but does not mark it via _CCCL_CHECK_BUILTIN -#if _CCCL_CHECK_BUILTIN(builtin_addressof) || _CCCL_COMPILER(GCC, >=, 7) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(builtin_addressof) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVHPC) # define _CCCL_BUILTIN_ADDRESSOF(...) __builtin_addressof(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_addressof) @@ -117,7 +116,7 @@ #endif // _CCCL_CUDACC_BELOW(11, 2) // MSVC supports __builtin_bit_cast from 19.25 on -#if _CCCL_CHECK_BUILTIN(builtin_bit_cast) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION > 1925) +#if _CCCL_CHECK_BUILTIN(builtin_bit_cast) || _CCCL_COMPILER(MSVC, >, 19, 25) # define _CCCL_BUILTIN_BIT_CAST(...) 
__builtin_bit_cast(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_bit_cast) @@ -127,7 +126,7 @@ # undef _CCCL_BUILTIN_BIT_CAST #endif // clang < 10 || nvcc < 11.7 -#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927 +#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_COLUMN() __builtin_COLUMN() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_COLUMN) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_COLUMN) vvv # define _CCCL_BUILTIN_COLUMN() 0 @@ -143,12 +142,11 @@ # define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_contant_p) -#if _CCCL_CHECK_BUILTIN(builtin_expect) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(GCC) +#if _CCCL_CHECK_BUILTIN(builtin_expect) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_EXPECT(...) __builtin_expect(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_expect) -#if _CCCL_HAS_BUILTIN(__builtin_FILE) || _CCCL_COMPILER(GCC) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927) +#if _CCCL_HAS_BUILTIN(__builtin_FILE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FILE() __builtin_FILE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FILE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FILE) vvv # define _CCCL_BUILTIN_FILE() __FILE__ @@ -160,8 +158,7 @@ # define _CCCL_BUILTIN_FILE() __FILE__ #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927) +#if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FUNCTION() __builtin_FUNCTION() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FUNCTION) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FUNCTION) vvv # define _CCCL_BUILTIN_FUNCTION() "__builtin_FUNCTION is unsupported" @@ -174,7 +171,7 @@ #endif // _CCCL_CUDACC_BELOW(11, 3) #if _CCCL_CHECK_BUILTIN(builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) + || (_CCCL_COMPILER(MSVC, >, 19, 24) && _CCCL_CUDACC_AT_LEAST(11, 3)) # define _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(...) __builtin_is_constant_evaluated(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_is_constant_evaluated) @@ -193,8 +190,7 @@ # undef _CCCL_BUILTIN_LAUNDER #endif // clang < 10 || nvcc < 11.3 -#if _CCCL_HAS_BUILTIN(__builtin_LINE) || _CCCL_COMPILER(GCC) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927) +#if _CCCL_HAS_BUILTIN(__builtin_LINE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_LINE() __builtin_LINE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_LINE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_LINE) vvv # define _CCCL_BUILTIN_LINE() __LINE__ @@ -216,27 +212,27 @@ # define _CCCL_BUILTIN_DECAY(...) __decay(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__decay) && clang-cuda -#if _CCCL_CHECK_BUILTIN(has_nothrow_assign) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_nothrow_assign) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_NOTHROW_ASSIGN(...) 
__has_nothrow_assign(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_nothrow_assign) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_nothrow_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_nothrow_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_NOTHROW_CONSTRUCTOR(...) __has_nothrow_constructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_nothrow_constructor) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_nothrow_copy) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_nothrow_copy) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_NOTHROW_COPY(...) __has_nothrow_copy(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_nothrow_copy) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_trivial_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_trivial_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_TRIVIAL_CONSTRUCTOR(...) __has_trivial_constructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_trivial_constructor) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_trivial_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_trivial_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_TRIVIAL_DESTRUCTOR(...) __has_trivial_destructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_trivial_destructor) && gcc >= 4.3 @@ -245,7 +241,7 @@ # define _CCCL_BUILTIN_HAS_UNIQUE_OBJECT_REPRESENTATIONS(...) __has_unique_object_representations(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_unique_object_representations) && gcc >= 7.0 -#if _CCCL_CHECK_BUILTIN(has_virtual_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_virtual_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_VIRTUAL_DESTRUCTOR(...) __has_virtual_destructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_virtual_destructor) && gcc >= 4.3 @@ -254,8 +250,8 @@ # define _CCCL_BUILTIN_INTEGER_PACK(...) __integer_pack(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__integer_pack) -#if _CCCL_CHECK_BUILTIN(is_aggregate) || _CCCL_COMPILER(GCC, >=, 7) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION > 1914) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_aggregate) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC, >, 19, 14) \ + || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_AGGREGATE(...) __is_aggregate(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_aggregate) && gcc >= 7.0 @@ -268,17 +264,15 @@ # undef _CCCL_BUILTIN_IS_ARRAY #endif // clang < 19 -#if _CCCL_CHECK_BUILTIN(is_assignable) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(GCC, >=, 9) +#if _CCCL_CHECK_BUILTIN(is_assignable) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, >=, 9) # define _CCCL_BUILTIN_IS_ASSIGNABLE(...) 
__is_assignable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_assignable) && gcc >= 9.0 -#if _CCCL_CHECK_BUILTIN(is_base_of) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_base_of) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_BASE_OF(...) __is_base_of(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_base_of) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(is_class) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_class) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_CLASS(...) __is_class(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_class) && gcc >= 4.3 @@ -290,31 +284,27 @@ # define _CCCL_BUILTIN_IS_CONST(...) __is_const(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_const) -#if _CCCL_CHECK_BUILTIN(is_constructible) || _CCCL_COMPILER(GCC, >=, 8) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_constructible) || _CCCL_COMPILER(GCC, >=, 8) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_CONSTRUCTIBLE(...) __is_constructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_constructible) && gcc >= 8.0 -#if _CCCL_CHECK_BUILTIN(is_convertible_to) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_convertible_to) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_CONVERTIBLE_TO(...) __is_convertible_to(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_convertible_to) -#if _CCCL_CHECK_BUILTIN(is_destructible) || defined(_CCCL_COMPILER_MSVC) +#if _CCCL_CHECK_BUILTIN(is_destructible) || _CCCL_COMPILER(MSVC) # define _CCCL_BUILTIN_IS_DESTRUCTIBLE(...) __is_destructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_destructible) -#if _CCCL_CHECK_BUILTIN(is_empty) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_empty) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_EMPTY(...) __is_empty(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_empty) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(is_enum) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_enum) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_ENUM(...) __is_enum(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_enum) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(is_final) || _CCCL_COMPILER(GCC, >=, 4, 7) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_final) || _CCCL_COMPILER(GCC, >=, 4, 7) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_FINAL(...) __is_final(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_final) && gcc >= 4.7 @@ -341,7 +331,7 @@ # define _CCCL_BUILTIN_IS_INTEGRAL(...) __is_integral(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_integral) -#if _CCCL_CHECK_BUILTIN(is_literal_type) || _CCCL_COMPILER(GCC, >=, 4, 6) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_literal_type) || _CCCL_COMPILER(GCC, >=, 4, 6) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_LITERAL(...) 
__is_literal_type(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_literal_type) && gcc >= 4.6 @@ -367,15 +357,15 @@ # define _CCCL_BUILTIN_IS_MEMBER_POINTER(...) __is_member_pointer(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_member_pointer) -#if _CCCL_CHECK_BUILTIN(is_nothrow_assignable) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_nothrow_assignable) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_NOTHROW_ASSIGNABLE(...) __is_nothrow_assignable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_nothrow_assignable) -#if _CCCL_CHECK_BUILTIN(is_nothrow_constructible) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_nothrow_constructible) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_NOTHROW_CONSTRUCTIBLE(...) __is_nothrow_constructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_nothrow_constructible) -#if _CCCL_CHECK_BUILTIN(is_nothrow_destructible) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_nothrow_destructible) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_NOTHROW_DESTRUCTIBLE(...) __is_nothrow_destructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_nothrow_destructible) @@ -388,8 +378,7 @@ # undef _CCCL_BUILTIN_IS_OBJECT #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_CHECK_BUILTIN(is_pod) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_pod) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_POD(...) __is_pod(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_pod) && gcc >= 4.3 @@ -398,7 +387,7 @@ # define _CCCL_BUILTIN_IS_POINTER(...) __is_pointer(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_pointer) -#if _CCCL_CHECK_BUILTIN(is_polymorphic) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_polymorphic) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_POLYMORPHIC(...) __is_polymorphic(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_polymorphic) && gcc >= 4.3 @@ -430,38 +419,36 @@ # define _CCCL_BUILTIN_IS_SIGNED(...) __is_signed(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_signed) -#if _CCCL_CHECK_BUILTIN(is_standard_layout) || _CCCL_COMPILER(GCC, >=, 4, 7) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_standard_layout) || _CCCL_COMPILER(GCC, >=, 4, 7) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_STANDARD_LAYOUT(...) __is_standard_layout(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_standard_layout) && gcc >= 4.7 -#if _CCCL_CHECK_BUILTIN(is_trivial) || _CCCL_COMPILER(GCC, >=, 4, 5) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_trivial) || _CCCL_COMPILER(GCC, >=, 4, 5) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIAL(...) __is_trivial(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivial) && gcc >= 4.5 -#if _CCCL_CHECK_BUILTIN(is_trivially_assignable) || _CCCL_COMPILER(GCC, >=, 5, 1) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_trivially_assignable) || _CCCL_COMPILER(GCC, >=, 5, 1) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIALLY_ASSIGNABLE(...) 
__is_trivially_assignable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_assignable) && gcc >= 5.1 -#if _CCCL_CHECK_BUILTIN(is_trivially_constructible) || _CCCL_COMPILER(GCC, >=, 5, 1) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_trivially_constructible) || _CCCL_COMPILER(GCC, >=, 5, 1) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIALLY_CONSTRUCTIBLE(...) __is_trivially_constructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_constructible) && gcc >= 5.1 -#if _CCCL_CHECK_BUILTIN(is_trivially_copyable) || _CCCL_COMPILER(GCC, >=, 5, 1) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_trivially_copyable) || _CCCL_COMPILER(GCC, >=, 5, 1) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_copyable) && gcc >= 5.1 -#if _CCCL_CHECK_BUILTIN(is_trivially_destructible) || defined(_CCCL_COMPILER_MSVC) +#if _CCCL_CHECK_BUILTIN(is_trivially_destructible) || _CCCL_COMPILER(MSVC) # define _CCCL_BUILTIN_IS_TRIVIALLY_DESTRUCTIBLE(...) __is_trivially_destructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_destructible) -#if _CCCL_CHECK_BUILTIN(is_union) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_union) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_UNION(...) __is_union(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_union) && gcc >= 4.3 @@ -496,7 +483,7 @@ # define _CCCL_BUILTIN_ISNAN(...) __builtin_isnan(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(isnan) -#if _CCCL_CHECK_BUILTIN(make_integer_seq) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1923) +#if _CCCL_CHECK_BUILTIN(make_integer_seq) || _CCCL_COMPILER(MSVC, >=, 19, 23) # define _CCCL_BUILTIN_MAKE_INTEGER_SEQ(...) __make_integer_seq<__VA_ARGS__> #endif // _CCCL_CHECK_BUILTIN(make_integer_seq) @@ -553,27 +540,27 @@ # undef _CCCL_BUILTIN_TYPE_PACK_ELEMENT #endif // _CCCL_CUDACC_BELOW(12, 2) -#if _CCCL_CHECK_BUILTIN(underlying_type) || _CCCL_COMPILER(GCC, >=, 4, 7) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(underlying_type) || _CCCL_COMPILER(GCC, >=, 4, 7) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_UNDERLYING_TYPE(...) 
__underlying_type(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(underlying_type) && gcc >= 4.7 -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # // To use __builtin_FUNCSIG(), both MSVC and nvcc need to support it -# if _CCCL_MSVC_VERSION >= 1935 && _CCCL_CUDACC_AT_LEAST(12, 3) +# if _CCCL_COMPILER(MSVC, >=, 19, 35) && _CCCL_CUDACC_AT_LEAST(12, 3) # define _CCCL_BUILTIN_PRETTY_FUNCTION() __builtin_FUNCSIG() -# else // ^^^ _CCCL_MSVC_VERSION >= 1935 ^^^ / vvv _CCCL_MSVC_VERSION < 1935 vvv +# else // ^^^ _CCCL_COMPILER(MSVC, >=, 19, 35) ^^^ / vvv _CCCL_COMPILER(MSVC, <, 19, 35) vvv # define _CCCL_BUILTIN_PRETTY_FUNCTION() __FUNCSIG__ # define _CCCL_BROKEN_MSVC_FUNCSIG -# endif // _CCCL_MSVC_VERSION < 1935 -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# endif // _CCCL_COMPILER(MSVC, <, 19, 35) +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_BUILTIN_PRETTY_FUNCTION() __PRETTY_FUNCTION__ -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) // GCC's builtin_strlen isn't reliable at constexpr time // MSVC does not expose builtin_strlen before C++17 // NVRTC does not expose builtin_strlen -#if !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVRTC) && !(defined(_CCCL_COMPILER_MSVC) && _CCCL_STD_VER < 2017) +#if !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVRTC) && !(_CCCL_COMPILER(MSVC) && _CCCL_STD_VER < 2017) # define _CCCL_BUILTIN_STRLEN(...) __builtin_strlen(__VA_ARGS__) #endif diff --git a/libcudacxx/include/cuda/std/__cccl/compiler.h b/libcudacxx/include/cuda/std/__cccl/compiler.h index 89c054a12a1..fd7e93d22cb 100644 --- a/libcudacxx/include/cuda/std/__cccl/compiler.h +++ b/libcudacxx/include/cuda/std/__cccl/compiler.h @@ -27,9 +27,14 @@ #elif defined(__GNUC__) # define _CCCL_COMPILER_GCC _CCCL_COMPILER_MAKE_VERSION(__GNUC__, __GNUC_MINOR__) #elif defined(_MSC_VER) -# define _CCCL_COMPILER_MSVC -# define _CCCL_MSVC_VERSION _MSC_VER -# define _CCCL_MSVC_VERSION_FULL _MSC_FULL_VER +# define _CCCL_COMPILER_MSVC _CCCL_COMPILER_MAKE_VERSION(_MSC_VER / 100, _MSC_VER % 100) +# define _CCCL_COMPILER_MSVC2017 (_CCCL_COMPILER_MSVC < _CCCL_COMPILER_MAKE_VERSION(19, 20)) +# define _CCCL_COMPILER_MSVC2019 \ + (_CCCL_COMPILER_MSVC >= _CCCL_COMPILER_MAKE_VERSION(19, 20) \ + && _CCCL_COMPILER_MSVC < _CCCL_COMPILER_MAKE_VERSION(19, 30)) +# define _CCCL_COMPILER_MSVC2022 \ + (_CCCL_COMPILER_MSVC >= _CCCL_COMPILER_MAKE_VERSION(19, 30) \ + && _CCCL_COMPILER_MSVC < _CCCL_COMPILER_MAKE_VERSION(19, 40)) #elif defined(__CUDACC_RTC__) # define _CCCL_COMPILER_NVRTC _CCCL_COMPILER_MAKE_VERSION(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__) #endif @@ -52,17 +57,6 @@ _CCCL_COMPILER_COMPARE_BAD_ARG_COUNT)) #define _CCCL_COMPILER(...) 
_CCCL_COMPILER_SELECT(_CCCL_COMPILER_##__VA_ARGS__)(_CCCL_COMPILER_##__VA_ARGS__) -// Convenient shortcut to determine which version of MSVC we are dealing with -#if defined(_CCCL_COMPILER_MSVC) -# if _MSC_VER < 1920 -# define _CCCL_COMPILER_MSVC_2017 -# elif _MSC_VER < 1930 -# define _CCCL_COMPILER_MSVC_2019 -# else // _MSC_VER < 1940 -# define _CCCL_COMPILER_MSVC_2022 -# endif // _MSC_VER < 1940 -#endif // _CCCL_COMPILER_MSVC - // Determine the cuda compiler #if defined(__NVCC__) # define _CCCL_CUDA_COMPILER_NVCC @@ -100,10 +94,10 @@ #define _CCCL_TO_STRING(_STR) _CCCL_TO_STRING2(_STR) // Define the pragma for the host compiler -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_PRAGMA(_ARG) __pragma(_ARG) #else # define _CCCL_PRAGMA(_ARG) _Pragma(_CCCL_TO_STRING(_ARG)) -#endif // defined(_CCCL_COMPILER_MSVC) +#endif // _CCCL_COMPILER(MSVC) #endif // __CCCL_COMPILER_H diff --git a/libcudacxx/include/cuda/std/__cccl/diagnostic.h b/libcudacxx/include/cuda/std/__cccl/diagnostic.h index fdedae215f3..5824c65b8eb 100644 --- a/libcudacxx/include/cuda/std/__cccl/diagnostic.h +++ b/libcudacxx/include/cuda/std/__cccl/diagnostic.h @@ -55,7 +55,7 @@ # define _CCCL_DIAG_SUPPRESS_NVHPC(str) _CCCL_PRAGMA(diag_suppress str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) # define _CCCL_DIAG_SUPPRESS_ICC(str) -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define _CCCL_DIAG_PUSH _CCCL_PRAGMA(warning(push)) # define _CCCL_DIAG_POP _CCCL_PRAGMA(warning(pop)) # define _CCCL_DIAG_SUPPRESS_CLANG(str) @@ -96,17 +96,17 @@ _CCCL_DIAG_PUSH \ _CCCL_DIAG_SUPPRESS_NVHPC(deprecated_entity) # define _CCCL_SUPPRESS_DEPRECATED_POP _CCCL_DIAG_POP -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define _CCCL_SUPPRESS_DEPRECATED_PUSH \ _CCCL_DIAG_PUSH \ _CCCL_DIAG_SUPPRESS_MSVC(4996) # define _CCCL_SUPPRESS_DEPRECATED_POP _CCCL_DIAG_POP #else // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(ICC) && && !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVHPC) && - // !_CCCL_COMPILER_MSVC + // !_CCCL_COMPILER(MSVC) # define _CCCL_SUPPRESS_DEPRECATED_PUSH # define _CCCL_SUPPRESS_DEPRECATED_POP #endif // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(ICC) && && !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVHPC) && - // !_CCCL_COMPILER_MSVC + // !_CCCL_COMPILER(MSVC) // Enable us to selectively silence cuda compiler warnings #if defined(_CCCL_CUDA_COMPILER) @@ -114,31 +114,31 @@ # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) # elif defined(__NVCC_DIAG_PRAGMA_SUPPORT__) || _CCCL_COMPILER(ICC) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(nv_diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(nv_diag_default _WARNING) # else // ^^^ _CCCL_COMPILER_{MSVC,ICC}^^^ / vvv !_CCCL_COMPILER_{MSVC,ICC} vvv # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(nv_diagnostic push) _CCCL_PRAGMA(nv_diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(nv_diagnostic pop) -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) # elif _CCCL_COMPILER(NVHPC) # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(diagnostic push) _CCCL_PRAGMA(diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(diagnostic pop) # else // ^^^ __NVCC_DIAG_PRAGMA_SUPPORT__ ^^^ / vvv !__NVCC_DIAG_PRAGMA_SUPPORT__ vvv -# if defined(_CCCL_COMPILER_MSVC_2017) // MSVC 2017 has issues with restoring the warning +# if _CCCL_COMPILER(MSVC2017) // MSVC 2017 has 
issues with restoring the warning # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(diag_default _WARNING) -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) # endif // !__NVCC_DIAG_PRAGMA_SUPPORT__ #else // ^^^ _CCCL_CUDA_COMPILER ^^^ / vvv !_CCCL_CUDA_COMPILER vvv # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) #endif // other compilers -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_HAS_PRAGMA_MSVC_WARNING # if !defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING) # define _CCCL_USE_PRAGMA_MSVC_WARNING @@ -160,13 +160,13 @@ # define _CCCL_MSVC_WARNINGS_PUSH \ _CCCL_PRAGMA(warning(push)) _CCCL_PRAGMA(warning(disable : _CCCL_MSVC_DISABLED_WARNINGS)) # define _CCCL_MSVC_WARNINGS_POP _CCCL_PRAGMA(warning(pop)) -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_MSVC_WARNINGS_PUSH # define _CCCL_MSVC_WARNINGS_POP -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #ifndef _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO -# if defined(_CCCL_COMPILER_MSVC_2017) || _CCCL_COMPILER(NVRTC) +# if _CCCL_COMPILER(MSVC2017) || _CCCL_COMPILER(NVRTC) # define _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO # endif #endif // _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 4b96695de73..8dfedd5a3cc 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -22,7 +22,7 @@ # pragma system_header #endif // no system header -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # if _MSVC_LANG <= 201103L # define _CCCL_STD_VER 2011 # elif _MSVC_LANG <= 201402L @@ -34,7 +34,7 @@ # else # define _CCCL_STD_VER 2023 // current year, or date of c++2b ratification # endif -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # if __cplusplus <= 199711L # define _CCCL_STD_VER 2003 # elif __cplusplus <= 201103L @@ -50,7 +50,7 @@ # else # define _CCCL_STD_VER 2024 // current year, or date of c++2c ratification # endif -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #if _CCCL_STD_VER >= 2014 # define _CCCL_CONSTEXPR_CXX14 constexpr diff --git a/libcudacxx/include/cuda/std/__cccl/exceptions.h b/libcudacxx/include/cuda/std/__cccl/exceptions.h index 24124bfa126..9f9e439e14e 100644 --- a/libcudacxx/include/cuda/std/__cccl/exceptions.h +++ b/libcudacxx/include/cuda/std/__cccl/exceptions.h @@ -25,8 +25,8 @@ #ifndef _CCCL_NO_EXCEPTIONS # if defined(CCCL_DISABLE_EXCEPTIONS) // Escape hatch for users to manually disable exceptions # define _CCCL_NO_EXCEPTIONS -# elif _CCCL_COMPILER(NVRTC) || (defined(_CCCL_COMPILER_MSVC) && _CPPUNWIND == 0) \ - || (!defined(_CCCL_COMPILER_MSVC) && !__EXCEPTIONS) // Catches all non msvc based compilers +# elif _CCCL_COMPILER(NVRTC) || (_CCCL_COMPILER(MSVC) && _CPPUNWIND == 0) \ + || (!_CCCL_COMPILER(MSVC) && !__EXCEPTIONS) // Catches all non msvc based compilers # define _CCCL_NO_EXCEPTIONS # endif #endif // 
!_CCCL_NO_EXCEPTIONS diff --git a/libcudacxx/include/cuda/std/__cccl/rtti.h b/libcudacxx/include/cuda/std/__cccl/rtti.h index 502407a0607..174b6313d87 100644 --- a/libcudacxx/include/cuda/std/__cccl/rtti.h +++ b/libcudacxx/include/cuda/std/__cccl/rtti.h @@ -38,7 +38,7 @@ # endif # elif _CCCL_COMPILER(NVRTC) # define _CCCL_NO_RTTI -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # if _CPPRTTI == 0 # define _CCCL_NO_RTTI # endif @@ -65,7 +65,7 @@ # endif # elif _CCCL_COMPILER(NVRTC) # define _CCCL_NO_TYPEID -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) // No-op, MSVC always supports typeid even when RTTI is disabled # elif _CCCL_COMPILER(CLANG) # if !_CCCL_HAS_FEATURE(cxx_rtti) diff --git a/libcudacxx/include/cuda/std/__cccl/system_header.h b/libcudacxx/include/cuda/std/__cccl/system_header.h index 2285bcf1651..d557dc88682 100644 --- a/libcudacxx/include/cuda/std/__cccl/system_header.h +++ b/libcudacxx/include/cuda/std/__cccl/system_header.h @@ -19,19 +19,18 @@ # define _CCCL_FORCE_SYSTEM_HEADER_GCC #elif _CCCL_COMPILER(CLANG) # define _CCCL_FORCE_SYSTEM_HEADER_CLANG -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define _CCCL_FORCE_SYSTEM_HEADER_MSVC #endif // other compilers // Potentially enable that cccl headers are treated as system headers -#if !defined(_CCCL_NO_SYSTEM_HEADER) \ - && !(defined(_CCCL_COMPILER_MSVC) && defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING)) && !_CCCL_COMPILER(NVRTC) \ - && !defined(_LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER) +#if !defined(_CCCL_NO_SYSTEM_HEADER) && !(_CCCL_COMPILER(MSVC) && defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING)) \ + && !_CCCL_COMPILER(NVRTC) && !defined(_LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER) # if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(NVHPC) || _CCCL_COMPILER(ICC) # define _CCCL_IMPLICIT_SYSTEM_HEADER_GCC # elif _CCCL_COMPILER(CLANG) # define _CCCL_IMPLICIT_SYSTEM_HEADER_CLANG -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _CCCL_IMPLICIT_SYSTEM_HEADER_MSVC # endif // other compilers #endif // Use system header diff --git a/libcudacxx/include/cuda/std/__cccl/unreachable.h b/libcudacxx/include/cuda/std/__cccl/unreachable.h index eb6ae9a63db..f92a042fb05 100644 --- a/libcudacxx/include/cuda/std/__cccl/unreachable.h +++ b/libcudacxx/include/cuda/std/__cccl/unreachable.h @@ -35,18 +35,18 @@ # define _CCCL_UNREACHABLE() __builtin_unreachable() # endif // CUDACC above 11.4 #else // ^^^ __CUDA_ARCH__ ^^^ / vvv !__CUDA_ARCH__ vvv -# if defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(MSVC2017) template _LIBCUDACXX_HIDE_FROM_ABI __declspec(noreturn) void __cccl_unreachable_fallback() { __assume(0); } # define _CCCL_UNREACHABLE() __cccl_unreachable_fallback() -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _CCCL_UNREACHABLE() __assume(0) -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_UNREACHABLE() __builtin_unreachable() -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #endif // !__CUDA_ARCH__ #endif // __CCCL_UNREACHABLE_H diff --git a/libcudacxx/include/cuda/std/__cccl/visibility.h b/libcudacxx/include/cuda/std/__cccl/visibility.h index ad35694a448..781e5a4fefa 100644 --- a/libcudacxx/include/cuda/std/__cccl/visibility.h +++ b/libcudacxx/include/cuda/std/__cccl/visibility.h @@ -37,21 +37,21 @@ #endif // _CCCL_COMPILER(NVHPC) // Enable us to hide kernels -#if defined(_CCCL_COMPILER_MSVC) || 
_CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_VISIBILITY_HIDDEN #else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv # define _CCCL_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) #endif // !_CCCL_COMPILER(NVRTC) -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_VISIBILITY_DEFAULT __declspec(dllimport) -#elif _CCCL_COMPILER(NVRTC) // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv +#elif _CCCL_COMPILER(NVRTC) // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv # define _CCCL_VISIBILITY_DEFAULT #else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ / vvv !_CCCL_COMPILER(NVRTC) vvv # define _CCCL_VISIBILITY_DEFAULT __attribute__((__visibility__("default"))) #endif // !_CCCL_COMPILER(NVRTC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_TYPE_VISIBILITY_DEFAULT #elif _CCCL_HAS_ATTRIBUTE(__type_visibility__) # define _CCCL_TYPE_VISIBILITY_DEFAULT __attribute__((__type_visibility__("default"))) @@ -59,11 +59,11 @@ # define _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_VISIBILITY_DEFAULT #endif // !_CCCL_COMPILER(NVRTC) -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_FORCEINLINE __forceinline -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv _CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv _CCCL_COMPILER(MSVC) vvv # define _CCCL_FORCEINLINE __inline__ __attribute__((__always_inline__)) -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #if _CCCL_HAS_ATTRIBUTE(exclude_from_explicit_instantiation) # define _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION __attribute__((exclude_from_explicit_instantiation)) diff --git a/libcudacxx/include/cuda/std/__concepts/concept_macros.h b/libcudacxx/include/cuda/std/__concepts/concept_macros.h index 6dc147e9084..2850c38a493 100644 --- a/libcudacxx/include/cuda/std/__concepts/concept_macros.h +++ b/libcudacxx/include/cuda/std/__concepts/concept_macros.h @@ -78,14 +78,14 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_true() return true; } -# if _CCCL_COMPILER(CLANG) || defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI __cccl_enable_if_t<_Bp> __cccl_requires() {} -# else // ^^^ _CCCL_COMPILER(CLANG) || defined(_CCCL_COMPILER_MSVC) ^^^ / vvv other compilers vvv +# else // ^^^ _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) ^^^ / vvv other compilers vvv template = 0> _CCCL_INLINE_VAR constexpr int __cccl_requires = 0; -# endif // !_CCCL_COMPILER(CLANG) && !defined(_CCCL_COMPILER_MSVC) +# endif // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI auto __cccl_make_dependent(_Tp*, _Tag<_Args...>*) -> _Tp; diff --git a/libcudacxx/include/cuda/std/__concepts/convertible_to.h b/libcudacxx/include/cuda/std/__concepts/convertible_to.h index 329b493b490..169383cb095 100644 --- a/libcudacxx/include/cuda/std/__concepts/convertible_to.h +++ b/libcudacxx/include/cuda/std/__concepts/convertible_to.h @@ -35,9 +35,9 @@ concept convertible_to = is_convertible_v<_From, _To> && requires { static_cast< #elif _CCCL_STD_VER >= 2014 // ^^^ C++20 ^^^ / vvv C++14/17 vvv -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(1211) // nonstandard cast to array type ignored -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(171) // invalid type conversion, e.g. 
[with _From=int **, _To=const int *const *] // We cannot put this conversion check with the other constraint, as types with deleted operator will break here @@ -55,9 +55,9 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT convertible_to = _CCCL_FRAGMENT(__convertible_to_, _From, _To); -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(1211) // nonstandard cast to array type ignored -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(171) // invalid type conversion, e.g. [with _From=int **, _To=const int *const *] #endif // _CCCL_STD_VER >= 2014 diff --git a/libcudacxx/include/cuda/std/__concepts/destructible.h b/libcudacxx/include/cuda/std/__concepts/destructible.h index 421b5e41335..90426478490 100644 --- a/libcudacxx/include/cuda/std/__concepts/destructible.h +++ b/libcudacxx/include/cuda/std/__concepts/destructible.h @@ -32,12 +32,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if _CCCL_STD_VER > 2011 -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) template _CCCL_CONCEPT destructible = __is_nothrow_destructible(_Tp); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template _CCCL_INLINE_VAR constexpr bool __destructible_impl = false; @@ -67,7 +67,7 @@ _CCCL_INLINE_VAR constexpr bool __destructible<_Tp[_Nm]> = __destructible<_Tp>; template _CCCL_CONCEPT destructible = __destructible<_Tp>; -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #endif // _CCCL_STD_VER > 2011 diff --git a/libcudacxx/include/cuda/std/__concepts/swappable.h b/libcudacxx/include/cuda/std/__concepts/swappable.h index 8635bc9cc6c..8688e71a702 100644 --- a/libcudacxx/include/cuda/std/__concepts/swappable.h +++ b/libcudacxx/include/cuda/std/__concepts/swappable.h @@ -37,9 +37,9 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(461) // nonstandard cast to array type ignored -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) #if _CCCL_STD_VER > 2011 @@ -199,8 +199,8 @@ _LIBCUDACXX_END_NAMESPACE_STD #endif // _CCCL_STD_VER > 2011 -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(461) // nonstandard cast to array type ignored -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) #endif // _LIBCUDACXX___CONCEPTS_SWAPPABLE_H diff --git a/libcudacxx/include/cuda/std/__fwd/get.h b/libcudacxx/include/cuda/std/__fwd/get.h index 9280f9d45d3..6fd977fd158 100644 --- a/libcudacxx/include/cuda/std/__fwd/get.h +++ b/libcudacxx/include/cuda/std/__fwd/get.h @@ -72,7 +72,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 const _Tp&& get(const array<_Tp, _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -109,6 +109,6 @@ using _CUDA_VRANGES::get; _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___FWD_GET_H diff --git a/libcudacxx/include/cuda/std/__fwd/subrange.h b/libcudacxx/include/cuda/std/__fwd/subrange.h index ba6b5e45ef5..d89df6f0ab2 100644 --- a/libcudacxx/include/cuda/std/__fwd/subrange.h +++ b/libcudacxx/include/cuda/std/__fwd/subrange.h @@ -22,7 +22,7 @@ #include -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) 
_LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -52,6 +52,6 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT subrange; _LIBCUDACXX_END_NAMESPACE_RANGES_ABI _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___FWD_SUBRANGE_H diff --git a/libcudacxx/include/cuda/std/__iterator/concepts.h b/libcudacxx/include/cuda/std/__iterator/concepts.h index 59b2d0818dc..e4e507afe83 100644 --- a/libcudacxx/include/cuda/std/__iterator/concepts.h +++ b/libcudacxx/include/cuda/std/__iterator/concepts.h @@ -403,7 +403,7 @@ template _CCCL_CONCEPT bidirectional_iterator = _CCCL_FRAGMENT(__bidirectional_iterator_, _Ip); // [iterator.concept.random.access] -# if defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(MSVC2017) // For whatever reasons MSVC2017 cannot check decltype(__n + __j) template _CCCL_CONCEPT_FRAGMENT( @@ -415,7 +415,7 @@ _CCCL_CONCEPT_FRAGMENT( requires(same_as<_Ip&, decltype(__i -= __n)>), requires(same_as<_Ip, decltype(__j - __n)>), requires(same_as, decltype(__j[__n])>))); -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv template _CCCL_CONCEPT_FRAGMENT( __random_access_iterator_operations_, @@ -426,7 +426,7 @@ _CCCL_CONCEPT_FRAGMENT( requires(same_as<_Ip&, decltype(__i -= __n)>), requires(same_as<_Ip, decltype(__j - __n)>), requires(same_as, decltype(__j[__n])>))); -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) template _CCCL_CONCEPT __random_access_iterator_operations = _CCCL_FRAGMENT(__random_access_iterator_operations_, _Ip); diff --git a/libcudacxx/include/cuda/std/__iterator/distance.h b/libcudacxx/include/cuda/std/__iterator/distance.h index 1e6fae1c988..441c30c9a9f 100644 --- a/libcudacxx/include/cuda/std/__iterator/distance.h +++ b/libcudacxx/include/cuda/std/__iterator/distance.h @@ -59,7 +59,7 @@ distance(_InputIter __first, _InputIter __last) } _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) // [range.iter.op.distance] @@ -118,6 +118,6 @@ _CCCL_GLOBAL_CONSTANT auto distance = __distance::__fn{}; } // namespace __cpo _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ITERATOR_DISTANCE_H diff --git a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h index f20dde7d1b1..759af45cc3a 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h @@ -40,11 +40,11 @@ #include #if !_CCCL_COMPILER(NVRTC) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # include // for ::std::input_iterator_tag -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # include // for ::std::input_iterator_tag -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) # if _CCCL_STD_VER >= 2020 template diff --git a/libcudacxx/include/cuda/std/__iterator/move_iterator.h b/libcudacxx/include/cuda/std/__iterator/move_iterator.h index 7e2e176b817..efdf656366a 100644 --- a/libcudacxx/include/cuda/std/__iterator/move_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/move_iterator.h @@ 
-107,7 +107,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator _Iter __current_; #if _CCCL_STD_VER >= 2017 -# if !defined(_CCCL_COMPILER_MSVC_2017) +# if !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __mi_get_iter_concept() { if constexpr (random_access_iterator<_Iter>) @@ -128,22 +128,22 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator } _CCCL_UNREACHABLE(); } -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) #endif // _CCCL_STD_VER >= 2017 public: #if _CCCL_STD_VER > 2014 using iterator_type = _Iter; -# if defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(MSVC2017) // clang-format off using iterator_concept = conditional_t, random_access_iterator_tag, conditional_t, bidirectional_iterator_tag, conditional_t, forward_iterator_tag, input_iterator_tag>>>; // clang-format on -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv using iterator_concept = decltype(__mi_get_iter_concept()); -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) // iterator_category is inherited and not always present using value_type = iter_value_t<_Iter>; @@ -370,7 +370,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator return _CUDA_VRANGES::iter_move(__i.__current_); } -# if defined(_CCCL_COMPILER_MSVC_2017) // MSVC2017 cannot find _Iter otherwise +# if _CCCL_COMPILER(MSVC2017) // MSVC2017 cannot find _Iter otherwise template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap( const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) noexcept(__noexcept_swappable<_Iter1, _Iter2>) @@ -378,7 +378,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator { return _CUDA_VRANGES::iter_swap(__x.__current_, __y.__current_); } -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap(const move_iterator& __x, const move_iterator<_Iter2>& __y) noexcept(__noexcept_swappable<_Iter, _Iter2>) @@ -386,17 +386,17 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator { return _CUDA_VRANGES::iter_swap(__x.__current_, __y.__current_); } -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) #endif // _CCCL_STD_VER > 2014 }; _LIBCUDACXX_CTAD_SUPPORTED_FOR_TYPE(move_iterator); // Some compilers have issues determining _IsFancyPointer -#if _CCCL_COMPILER(GCC) || defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC) template struct _IsFancyPointer> : _IsFancyPointer<_Iter> {}; -#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool diff --git a/libcudacxx/include/cuda/std/__iterator/next.h b/libcudacxx/include/cuda/std/__iterator/next.h index 4651214e4bd..f100e76ec9d 100644 --- a/libcudacxx/include/cuda/std/__iterator/next.h +++ b/libcudacxx/include/cuda/std/__iterator/next.h @@ -42,7 +42,7 @@ next(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) // [range.iter.op.next] @@ -90,6 +90,6 @@ _CCCL_GLOBAL_CONSTANT auto next = __next::__fn{}; } // namespace __cpo _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER > 2014 && 
!defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ITERATOR_NEXT_H diff --git a/libcudacxx/include/cuda/std/__iterator/prev.h b/libcudacxx/include/cuda/std/__iterator/prev.h index f28098d9e45..1e5e78d043e 100644 --- a/libcudacxx/include/cuda/std/__iterator/prev.h +++ b/libcudacxx/include/cuda/std/__iterator/prev.h @@ -41,7 +41,7 @@ prev(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) // [range.iter.op.prev] @@ -81,6 +81,6 @@ _CCCL_GLOBAL_CONSTANT auto prev = __prev::__fn{}; } // namespace __cpo _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ITERATOR_PREV_H diff --git a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h index 6f2b0cce65e..982312731f9 100644 --- a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h @@ -248,7 +248,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator return _CUDA_VRANGES::iter_move(--__tmp); } -# if defined(_CCCL_COMPILER_MSVC_2017) // MSVC2017 cannot find _Iter otherwise +# if _CCCL_COMPILER(MSVC2017) // MSVC2017 cannot find _Iter otherwise template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap(const reverse_iterator<_Iter1>& __x, @@ -259,7 +259,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator auto __ytmp = __y.base(); _CUDA_VRANGES::iter_swap(--__xtmp, --__ytmp); } -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap(const reverse_iterator& __x, @@ -270,7 +270,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator auto __ytmp = __y.base(); return _CUDA_VRANGES::iter_swap(--__xtmp, --__ytmp); } -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) #endif // _CCCL_STD_VER > 2014 }; diff --git a/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h b/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h index 785c6d149c9..3ffffea090c 100644 --- a/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h +++ b/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h @@ -35,12 +35,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI // are only ever found through ADL struct unreachable_sentinel_t -# ifdef _CCCL_COMPILER_MSVC +# if _CCCL_COMPILER(MSVC) ; namespace __unreachable_sentinel_detail { struct __unreachable_base -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) { _CCCL_TEMPLATE(class _Iter) _CCCL_REQUIRES(weakly_incrementable<_Iter>) @@ -74,11 +74,11 @@ struct __unreachable_base # endif // _CCCL_STD_VER < 2020 }; -# ifdef _CCCL_COMPILER_MSVC +# if _CCCL_COMPILER(MSVC) } // namespace __unreachable_sentinel_detail struct unreachable_sentinel_t : __unreachable_sentinel_detail::__unreachable_base {}; -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_END_NAMESPACE_RANGES_ABI diff --git a/libcudacxx/include/cuda/std/__memory/assume_aligned.h b/libcudacxx/include/cuda/std/__memory/assume_aligned.h index c8f9310ed1a..ce7b70e6a01 100644 --- 
a/libcudacxx/include/cuda/std/__memory/assume_aligned.h +++ b/libcudacxx/include/cuda/std/__memory/assume_aligned.h @@ -36,9 +36,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp* assume_alig #if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && defined(_CCCL_BUILTIN_ASSUME_ALIGNED) if (!_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) { -# if !defined(_CCCL_COMPILER_MSVC) // MSVC checks within the builtin +# if !_CCCL_COMPILER(MSVC) // MSVC checks within the builtin _CCCL_ASSERT(reinterpret_cast(__ptr) % _Align == 0, "Alignment assumption is violated"); -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) return static_cast<_Tp*>(_CCCL_BUILTIN_ASSUME_ALIGNED(__ptr, _Align)); } else diff --git a/libcudacxx/include/cuda/std/__ranges/access.h b/libcudacxx/include/cuda/std/__ranges/access.h index 9a18ddd88b6..2c1525e1ad4 100644 --- a/libcudacxx/include/cuda/std/__ranges/access.h +++ b/libcudacxx/include/cuda/std/__ranges/access.h @@ -33,7 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) template _CCCL_CONCEPT __can_borrow = is_lvalue_reference_v<_Tp> || enable_borrowed_range>; @@ -279,7 +279,7 @@ inline namespace __cpo { _CCCL_GLOBAL_CONSTANT auto cend = __cend::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/concepts.h b/libcudacxx/include/cuda/std/__ranges/concepts.h index 8b4aa426b51..26d7fe421e7 100644 --- a/libcudacxx/include/cuda/std/__ranges/concepts.h +++ b/libcudacxx/include/cuda/std/__ranges/concepts.h @@ -44,7 +44,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) # if _CCCL_STD_VER >= 2020 @@ -301,7 +301,7 @@ template _CCCL_CONCEPT __container_compatible_range = _CCCL_FRAGMENT(__container_compatible_range_, _Range, _Tp); # endif // _CCCL_STD_VER <= 2017 -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/dangling.h b/libcudacxx/include/cuda/std/__ranges/dangling.h index e0974298c03..b97e5e5555a 100644 --- a/libcudacxx/include/cuda/std/__ranges/dangling.h +++ b/libcudacxx/include/cuda/std/__ranges/dangling.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) struct dangling { @@ -47,7 +47,7 @@ using borrowed_iterator_t = enable_if_t, _If, ite // borrowed_subrange_t defined in <__ranges/subrange.h> -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/data.h b/libcudacxx/include/cuda/std/__ranges/data.h index 0e949a12489..f5bf6015963 100644 --- a/libcudacxx/include/cuda/std/__ranges/data.h +++ b/libcudacxx/include/cuda/std/__ranges/data.h @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [range.prim.data] @@ -128,7 +128,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto cdata = __cdata::__fn{}; } // 
namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/empty.h b/libcudacxx/include/cuda/std/__ranges/empty.h index 9eee04a6644..d8f8213e9a8 100644 --- a/libcudacxx/include/cuda/std/__ranges/empty.h +++ b/libcudacxx/include/cuda/std/__ranges/empty.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [range.prim.empty] @@ -104,7 +104,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto empty = __empty::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/rbegin.h b/libcudacxx/include/cuda/std/__ranges/rbegin.h index 671e8e31798..8b70f702797 100644 --- a/libcudacxx/include/cuda/std/__ranges/rbegin.h +++ b/libcudacxx/include/cuda/std/__ranges/rbegin.h @@ -33,7 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [ranges.access.rbegin] @@ -168,7 +168,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto crbegin = __crbegin::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/rend.h b/libcudacxx/include/cuda/std/__ranges/rend.h index 28ec5e9e021..5c266d63bdd 100644 --- a/libcudacxx/include/cuda/std/__ranges/rend.h +++ b/libcudacxx/include/cuda/std/__ranges/rend.h @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [range.access.rend] @@ -174,7 +174,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto crend = __crend::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/size.h b/libcudacxx/include/cuda/std/__ranges/size.h index 0c87e1c1ef3..04487441586 100644 --- a/libcudacxx/include/cuda/std/__ranges/size.h +++ b/libcudacxx/include/cuda/std/__ranges/size.h @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template _CCCL_INLINE_VAR constexpr bool disable_sized_range = false; @@ -200,7 +200,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto ssize = __ssize::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/subrange.h b/libcudacxx/include/cuda/std/__ranges/subrange.h index a9eb9f2572f..190df21d43b 100644 --- a/libcudacxx/include/cuda/std/__ranges/subrange.h +++ b/libcudacxx/include/cuda/std/__ranges/subrange.h @@ -51,7 +51,7 @@ #include #include -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // MSVC complains about [[msvc::no_unique_address]] 
prior to C++20 as a vendor extension _CCCL_DIAG_PUSH @@ -514,6 +514,6 @@ _LIBCUDACXX_END_NAMESPACE_STD _CCCL_DIAG_POP -#endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___RANGES_SUBRANGE_H diff --git a/libcudacxx/include/cuda/std/__ranges/unwrap_end.h b/libcudacxx/include/cuda/std/__ranges/unwrap_end.h index f134f141e8f..9e0b6636ff0 100644 --- a/libcudacxx/include/cuda/std/__ranges/unwrap_end.h +++ b/libcudacxx/include/cuda/std/__ranges/unwrap_end.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(forward_range<_Range>) @@ -46,7 +46,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator_t<_Range> __unwrap_ _CCCL_UNREACHABLE(); } -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/view_interface.h b/libcudacxx/include/cuda/std/__ranges/view_interface.h index f7f14b53c09..661e20c1b68 100644 --- a/libcudacxx/include/cuda/std/__ranges/view_interface.h +++ b/libcudacxx/include/cuda/std/__ranges/view_interface.h @@ -37,7 +37,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) # if _CCCL_STD_VER >= 2020 template @@ -178,7 +178,7 @@ class view_interface _LIBCUDACXX_END_NAMESPACE_RANGES_ABI -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/views.h b/libcudacxx/include/cuda/std/__ranges/views.h index 8941de6c14d..3954877f117 100644 --- a/libcudacxx/include/cuda/std/__ranges/views.h +++ b/libcudacxx/include/cuda/std/__ranges/views.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_VIEWS @@ -33,6 +33,6 @@ namespace views = ranges::views; _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___RANGES_VIEWS diff --git a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h index 8c381a9af91..e054f78729e 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h @@ -157,7 +157,7 @@ struct tuple_element<_Ip, const volatile _CUDA_VSTD::tuple<_Tp...>> : _CUDA_VSTD::tuple_element<_Ip, const volatile _CUDA_VSTD::tuple<_Tp...>> {}; -# if !defined(_CCCL_COMPILER_MSVC_2017) +# if !_CCCL_COMPILER(MSVC2017) template struct tuple_size<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : _CUDA_VSTD::tuple_size<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> @@ -197,7 +197,7 @@ template struct tuple_element<_Idx, const volatile _CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : _CUDA_VSTD::tuple_element<_Idx, const volatile _CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> {}; -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) } // namespace std #endif // _CCCL_STD_VER >= 2017 diff --git 
a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h index b1f2273b035..28a6b1dada9 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h @@ -56,11 +56,11 @@ template struct __tuple_like> : true_type {}; -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template struct __tuple_like<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : true_type {}; -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template struct __tuple_like<__tuple_types<_Tp...>> : true_type diff --git a/libcudacxx/include/cuda/std/__type_traits/common_reference.h b/libcudacxx/include/cuda/std/__type_traits/common_reference.h index 020925bfb2c..6f62a1033ef 100644 --- a/libcudacxx/include/cuda/std/__type_traits/common_reference.h +++ b/libcudacxx/include/cuda/std/__type_traits/common_reference.h @@ -42,7 +42,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // common_reference // Let COND_RES(X, Y) be: -#ifdef _CCCL_COMPILER_MSVC // Workaround for DevCom-1627396 +#if _CCCL_COMPILER(MSVC) // Workaround for DevCom-1627396 template _Tp __returns_exactly() noexcept; // not defined @@ -67,10 +67,10 @@ struct __cond_res_workaround<_Tp, _Up, void_t<__cond_res_if_right<_Tp, _Up>>> template using __cond_res = typename __cond_res_workaround<_Xp, _Yp>::type; -#else // ^^^ MSVC ^^^ / vvv !MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template using __cond_res = decltype(false ? _CUDA_VSTD::declval<_Xp (&)()>()() : _CUDA_VSTD::declval<_Yp (&)()>()()); -#endif // !MSVC +#endif // !_CCCL_COMPILER(MSVC) // Let `XREF(A)` denote a unary alias template `T` such that `T` denotes the same type as `U` // with the addition of `A`'s cv and reference qualifiers, for a non-reference cv-unqualified type diff --git a/libcudacxx/include/cuda/std/__type_traits/common_type.h b/libcudacxx/include/cuda/std/__type_traits/common_type.h index 319d6fb7143..09067b52084 100644 --- a/libcudacxx/include/cuda/std/__type_traits/common_type.h +++ b/libcudacxx/include/cuda/std/__type_traits/common_type.h @@ -90,11 +90,11 @@ struct __common_type2_imp : __common_type3<_Tp, _Up> // branches have diverging return types, this happens for extended floating point types template using __msvc_declval_workaround = -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) enable_if_t<_CCCL_TRAIT(is_same, __cond_type<_Tp, _Up>, __cond_type<_Up, _Tp>)>; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv void; -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) // sub-bullet 3 - "if decay_t() : declval())> ..." 
template diff --git a/libcudacxx/include/cuda/std/__type_traits/disjunction.h b/libcudacxx/include/cuda/std/__type_traits/disjunction.h index 01fe64735a5..61bfca3f428 100644 --- a/libcudacxx/include/cuda/std/__type_traits/disjunction.h +++ b/libcudacxx/include/cuda/std/__type_traits/disjunction.h @@ -51,7 +51,7 @@ struct _OrImpl template using _Or _CCCL_NODEBUG_ALIAS = typename _OrImpl::template _Result; -#ifdef _CCCL_COMPILER_MSVC +#if _CCCL_COMPILER(MSVC) template struct disjunction : false_type {}; @@ -63,7 +63,7 @@ struct disjunction<_First, _Rest...> : _OrImpl::template _Result struct disjunction : _Or<_Args...> {}; -#endif // !MSVC +#endif // !_CCCL_COMPILER(MSVC) #if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_convertible.h b/libcudacxx/include/cuda/std/__type_traits/is_convertible.h index 11b16014cb8..4fbcb82deb6 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_convertible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_convertible.h @@ -43,7 +43,7 @@ template _CCCL_INLINE_VAR constexpr bool is_convertible_v = _CCCL_BUILTIN_IS_CONVERTIBLE_TO(_T1, _T2); # endif // !_CCCL_NO_VARIABLE_TEMPLATES -# ifdef _CCCL_COMPILER_MSVC // Workaround for DevCom-1627396 +# if _CCCL_COMPILER(MSVC) // Workaround for DevCom-1627396 template struct is_convertible<_Ty&, volatile _Ty&> : true_type {}; @@ -71,7 +71,7 @@ _CCCL_INLINE_VAR constexpr bool is_convertible_v<_Ty&, const volatile _Ty&> = tr template _CCCL_INLINE_VAR constexpr bool is_convertible_v = true; -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) #else // ^^^ _CCCL_BUILTIN_IS_CONVERTIBLE_TO ^^^ / vvv !_CCCL_BUILTIN_IS_CONVERTIBLE_TO vvv diff --git a/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h b/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h index 9c6a7ebc53d..d9d536d2b80 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __is_primary_template : false_type {}; @@ -37,13 +37,13 @@ struct __is_primary_template<_Tp, void_t> : public is_same<_Tp, typename _Tp::__primary_template> {}; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template using __test_for_primary_template = enable_if_t<_IsSame<_Tp, typename _Tp::__primary_template>::value>; template using __is_primary_template = _IsValidExpansion<__test_for_primary_template, _Tp>; -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index 4bd928b0013..00f69273673 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -559,7 +559,7 @@ using __type_back = __type_at_c<_List::__size - 1, _List>; namespace __detail { -# if defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1938 +# if _CCCL_COMPILER(MSVC, <, 19, 38) // A workaround for https://developercommunity.visualstudio.com/t/fatal-error-C1001:-Internal-compiler-err/10405847 struct __type_concat_fn { @@ -586,7 +586,7 @@ struct __type_concat_fn template using __call _CCCL_NODEBUG_ALIAS = __type<__trait<_Lists...>>; }; -# else // ^^^ _CCCL_COMPILER_MSVC < 19.38 ^^^ / vvv 
!(_CCCL_COMPILER_MSVC < 19.38) vvv +# else // ^^^ _CCCL_COMPILER(MSVC, <, 19, 38) ^^^ / vvv _CCCL_COMPILER(MSVC, >=, 19, 38) vvv template struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_maybe_concat_fn { @@ -646,7 +646,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_concat_fn __type_list_ptr<>{nullptr}, __type_list_ptr<>{nullptr})); }; -# endif // !(_CCCL_COMPILER_MSVC < 19.38) +# endif // _CCCL_COMPILER(MSVC, >=, 19, 38) } // namespace __detail //! \brief Concatenate a list of type lists into a single type list. diff --git a/libcudacxx/include/cuda/std/__type_traits/type_set.h b/libcudacxx/include/cuda/std/__type_traits/type_set.h index e73c6161070..c83ebd06af7 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_set.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_set.h @@ -84,7 +84,7 @@ struct __bulk_insert template <> struct __bulk_insert { -#if defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1920 +#if _CCCL_COMPILER(MSVC, <, 19, 20) template _LIBCUDACXX_HIDE_FROM_ABI static auto __insert_fn(__type_list<_Ty, _Us...>*) -> typename __bulk_insert::template __call, _Us...>; diff --git a/libcudacxx/include/cuda/std/__utility/auto_cast.h b/libcudacxx/include/cuda/std/__utility/auto_cast.h index ad4107ab178..b766493501f 100644 --- a/libcudacxx/include/cuda/std/__utility/auto_cast.h +++ b/libcudacxx/include/cuda/std/__utility/auto_cast.h @@ -23,7 +23,7 @@ #include -#if _CCCL_STD_VER < 2020 && defined(_CCCL_COMPILER_MSVC) +#if _CCCL_STD_VER < 2020 && _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_AUTO_CAST(expr) (_CUDA_VSTD::decay_t) (expr) #else # define _LIBCUDACXX_AUTO_CAST(expr) static_cast<_CUDA_VSTD::decay_t>(expr) diff --git a/libcudacxx/include/cuda/std/__utility/declval.h b/libcudacxx/include/cuda/std/__utility/declval.h index 96499be6e67..d7f701c201a 100644 --- a/libcudacxx/include/cuda/std/__utility/declval.h +++ b/libcudacxx/include/cuda/std/__utility/declval.h @@ -30,8 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // MSVC < 19.39 to miscompile so we use the fallback instead. The use of the // `__identity_t` alias is help MSVC parse the declaration correctly. 
#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) && !defined(_CCCL_NO_NOEXCEPT_FUNCTION_TYPE) \ - && !(defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_CUDACC_BELOW(12, 4)) \ - && !(defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1939) + && !(defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_CUDACC_BELOW(12, 4)) && !_CCCL_COMPILER(MSVC, <, 19, 39) template using __identity_t _CCCL_NODEBUG_ALIAS = _Tp; diff --git a/libcudacxx/include/cuda/std/bitset b/libcudacxx/include/cuda/std/bitset index 7c9839bc043..60d0e912c80 100644 --- a/libcudacxx/include/cuda/std/bitset +++ b/libcudacxx/include/cuda/std/bitset @@ -526,15 +526,15 @@ protected: _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const { -#ifdef _CCCL_COMPILER_MSVC +#if _CCCL_COMPILER(MSVC) if (static_cast(__first_.__data) != __first_.__data) { _CUDA_VSTD::__throw_overflow_error("bitset to_ulong overflow error"); } return static_cast(__first_.__data); -#else // ^^ MSVC ^^ | vv !MSVC vv +#else // ^^ _CCCL_COMPILER(MSVC) ^^ | vv !_CCCL_COMPILER(MSVC) vv return __first_.__data; -#endif // !MSVC +#endif // !_CCCL_COMPILER(MSVC) } _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index 2cdeeb4c1ef..2944ef6d4c8 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -115,7 +115,7 @@ extern "C++" { # elif defined(_LIBCUDACXX_ABI_FORCE_MICROSOFT) # define _LIBCUDACXX_ABI_MICROSOFT # else -# if defined(_WIN32) && defined(_CCCL_COMPILER_MSVC) +# if defined(_WIN32) && _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_ABI_MICROSOFT # else # define _LIBCUDACXX_ABI_ITANIUM @@ -186,7 +186,7 @@ extern "C++" { # define _LIBCUDACXX_MSVCRT_LIKE // If mingw not explicitly detected, assume using MS C runtime only if // a MS compatibility version is specified. 
-# if defined(_CCCL_COMPILER_MSVC) && !defined(__MINGW32__) +# if _CCCL_COMPILER(MSVC) && !defined(__MINGW32__) # define _LIBCUDACXX_MSVCRT // Using Microsoft's C Runtime library # endif # if (defined(_M_AMD64) || defined(__x86_64__)) || (defined(_M_ARM) || defined(__arm__)) @@ -287,14 +287,14 @@ extern "C++" { # define __alignof(x) alignof(x) # endif // _CCCL_COMPILER(NVRTC) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define __alignof__ __alignof # endif # define _LIBCUDACXX_ALIGNOF(_Tp) alignof(_Tp) # define _LIBCUDACXX_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define _CCCL_ALIGNAS_TYPE(x) alignas(x) # define _CCCL_ALIGNAS(x) __declspec(align(x)) # elif _CCCL_HAS_FEATURE(cxx_alignas) @@ -303,7 +303,7 @@ extern "C++" { # else # define _CCCL_ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCUDACXX_ALIGNOF(x)))) # define _CCCL_ALIGNAS(x) __attribute__((__aligned__(x))) -# endif // !_CCCL_COMPILER_MSVC && !_CCCL_HAS_FEATURE(cxx_alignas) +# endif // !_CCCL_COMPILER(MSVC) && !_CCCL_HAS_FEATURE(cxx_alignas) // This is wrapped in __CUDA_ARCH__ to prevent error: "ignoring '#pragma unroll' // [-Werror=unknown-pragmas]" @@ -389,7 +389,7 @@ typedef __char32_t char32_t; # define _LIBCUDACXX_DISABLE_EXTENSION_WARNING __extension__ -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_WARNING(x) _CCCL_PRAGMA(message(__FILE__ "(" _CCCL_TO_STRING(__LINE__) ") : warning note: " x)) @@ -461,7 +461,7 @@ typedef __char32_t char32_t; # endif // _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS # ifndef _LIBCUDACXX_HAS_NO_INT128 -# if defined(_CCCL_COMPILER_MSVC) || (_CCCL_COMPILER(NVRTC) && !defined(__CUDACC_RTC_INT128__)) \ +# if _CCCL_COMPILER(MSVC) || (_CCCL_COMPILER(NVRTC) && !defined(__CUDACC_RTC_INT128__)) \ || (defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_CUDACC_BELOW(11, 5)) || !defined(__SIZEOF_INT128__) # define _LIBCUDACXX_HAS_NO_INT128 # endif @@ -605,7 +605,7 @@ typedef unsigned int char32_t; // If we are getting operator new from the MSVC CRT, then allocation overloads // for align_val_t were added in 19.12, aka VS 2017 version 15.3. -# if defined(_LIBCUDACXX_MSVCRT) && defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1912 +# if defined(_LIBCUDACXX_MSVCRT) && _CCCL_COMPILER(MSVC, <, 19, 12) # define _LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION # elif defined(_LIBCUDACXX_ABI_VCRUNTIME) && !defined(__cpp_aligned_new) // We're deferring to Microsoft's STL to provide aligned new et al. 
We don't @@ -681,7 +681,7 @@ typedef unsigned int char32_t; # endif // _LIBCUDACXX_HAS_THREAD_API_CUDA # ifndef _LIBCUDACXX_HAS_THREAD_API_WIN32 -# if defined(_CCCL_COMPILER_MSVC) && !defined(_LIBCUDACXX_HAS_THREAD_API_CUDA) +# if _CCCL_COMPILER(MSVC) && !defined(_LIBCUDACXX_HAS_THREAD_API_CUDA) # define _LIBCUDACXX_HAS_THREAD_API_WIN32 # endif # endif // _LIBCUDACXX_HAS_THREAD_API_WIN32 @@ -744,7 +744,7 @@ typedef unsigned int char32_t; # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP # elif _CCCL_COMPILER(NVHPC) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL # endif @@ -800,7 +800,7 @@ typedef unsigned int char32_t; # define _LIBCUDACXX_PREFERRED_NAME(x) # endif -# if defined(_LIBCUDACXX_ABI_MICROSOFT) && (defined(_CCCL_COMPILER_MSVC) || __has_declspec_attribute(empty_bases)) +# if defined(_LIBCUDACXX_ABI_MICROSOFT) && (_CCCL_COMPILER(MSVC) || __has_declspec_attribute(empty_bases)) # define _LIBCUDACXX_DECLSPEC_EMPTY_BASES __declspec(empty_bases) # else # define _LIBCUDACXX_DECLSPEC_EMPTY_BASES @@ -822,7 +822,7 @@ typedef unsigned int char32_t; # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS # elif _CCCL_COMPILER(NVRTC) # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS # elif _CCCL_CUDACC_BELOW(11, 8) # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/climits b/libcudacxx/include/cuda/std/detail/libcxx/include/climits index ffe87d1caf0..f5b285ccc25 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/climits +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/climits @@ -51,7 +51,7 @@ Macros: _CCCL_PUSH_MACROS -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include #endif // _LIBCUDACXX_MSVCRT diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath index bf0f4da10f5..7066ddec4f2 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath @@ -683,7 +683,7 @@ __constexpr_isfinite(_A1 __lcpp_x) noexcept return isfinite(__lcpp_x); } -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_copysign(_A1 __x, _A1 __y) noexcept { @@ -715,9 +715,9 @@ __constexpr_copysign(_A1 __x, _A2 __y) noexcept static_assert((!(_IsSame<_A1, __result_type>::value && _IsSame<_A2, __result_type>::value)), ""); return __builtin_copysign((__result_type) __x, (__result_type) __y); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_fabs(_A1 __x) noexcept { @@ -744,9 +744,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_fabs(_Tp __x) { return __builtin_fabs(static_cast(__x)); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_fmax(_A1 __x, _A1 __y) noexcept { @@ -829,9 +829,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX __promote_t<_Tp, _Up> __ using __result_type = 
__promote_t<_Tp, _Up>; return _CUDA_VSTD::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y)); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_logb(_A1 __x) { @@ -874,7 +874,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_logb(_Tp } #endif // !_MSVC -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) template _LIBCUDACXX_HIDE_FROM_ABI _Tp __constexpr_scalbn(_Tp __x, int __i) { @@ -958,7 +958,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ # endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) return __builtin_scalbn(__x, __exp); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #if _CCCL_STD_VER > 2017 template diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/limits b/libcudacxx/include/cuda/std/detail/libcxx/include/limits index 06845b5f664..ea830da6046 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/limits +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/limits @@ -118,7 +118,7 @@ template<> class numeric_limits; _CCCL_PUSH_MACROS -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include #endif // _LIBCUDACXX_MSVCRT diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span index afe5ea34519..75774146c09 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/span +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span @@ -203,9 +203,9 @@ template _CCCL_CONCEPT __span_array_convertible = _CCCL_TRAIT(is_convertible, _From (*)[], _To (*)[]); // We want to ensure that span interacts nicely with containers that might not have had the ranges treatment -# if defined(__cpp_lib_ranges) && !defined(_CCCL_COMPILER_MSVC_2017) +# if defined(__cpp_lib_ranges) && !_CCCL_COMPILER(MSVC2017) # define _CCCL_SPAN_USES_RANGES -# endif // __cpp_lib_ranges && !_CCCL_COMPILER_MSVC_2017 +# endif // __cpp_lib_ranges && !_CCCL_COMPILER(MSVC2017) # if defined(_CCCL_SPAN_USES_RANGES) template @@ -367,7 +367,7 @@ public: } # endif // !_CCCL_SPAN_USES_RANGES -# if _CCCL_COMPILER(NVRTC) || defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(NVRTC) || _CCCL_COMPILER(MSVC2017) template = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr span(type_identity_t (&__arr)[_Sz]) noexcept : __data_{__arr} @@ -376,7 +376,7 @@ public: _LIBCUDACXX_HIDE_FROM_ABI constexpr span(type_identity_t (&__arr)[_Extent]) noexcept : __data_{__arr} {} -# endif // !_CCCL_COMPILER(NVRTC) && !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(NVRTC) && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _OtherElementType) _CCCL_REQUIRES(__span_array_convertible<_OtherElementType, element_type>) @@ -855,7 +855,7 @@ _CCCL_HOST_DEVICE span(const _Container&) -> span= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES template _CCCL_INLINE_VAR constexpr bool enable_borrowed_range> = true; @@ -863,6 +863,6 @@ _CCCL_INLINE_VAR constexpr bool enable_borrowed_range> = true template _CCCL_INLINE_VAR constexpr bool enable_view> = true; 
_LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX_SPAN diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/variant b/libcudacxx/include/cuda/std/detail/libcxx/include/variant index c40abf1fd23..d71967a3c85 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/variant +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/variant @@ -293,7 +293,7 @@ _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION #endif // !_CCCL_NO_EXCEPTIONS -#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -2152,6 +2152,6 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr auto&& __unchecked_get(variant<_Types...>& _ _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX_VARIANT diff --git a/libcudacxx/include/cuda/std/inplace_vector b/libcudacxx/include/cuda/std/inplace_vector index 73449b26188..5137fce4209 100644 --- a/libcudacxx/include/cuda/std/inplace_vector +++ b/libcudacxx/include/cuda/std/inplace_vector @@ -315,11 +315,11 @@ protected: iterator __curr = __dest; for (; __first != __last; ++__curr, (void) ++__first) { -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VRANGES::iter_move(__first)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VSTD::move(*__first)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } this->__size_ += static_cast<__size_type>(__curr - __dest); } @@ -332,11 +332,11 @@ protected: auto __guard = __make_exception_guard(_Rollback_change_size<__inplace_vector_storage>{this, __dest, __curr}); for (; __first != __last; ++__curr, (void) ++__first) { -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VRANGES::iter_move(__first)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VSTD::move(*__first)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } __guard.__complete(); this->__size_ += static_cast<__size_type>(__curr - __dest); @@ -590,22 +590,22 @@ protected: _LIBCUDACXX_HIDE_FROM_ABI constexpr void __uninitialized_copy(_Iter __first, _Iter __last, iterator __dest) noexcept { _CUDA_VSTD::copy(__first, __last, __dest); -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) __size_ += static_cast<__size_type>(_CUDA_VRANGES::distance(__first, __last)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv __size_ += static_cast<__size_type>(_CUDA_VSTD::distance(__first, __last)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } template _LIBCUDACXX_HIDE_FROM_ABI constexpr void __uninitialized_move(_Iter __first, _Iter __last, iterator __dest) noexcept { _CUDA_VSTD::copy(__first, __last, __dest); -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) __size_ += 
static_cast<__size_type>(_CUDA_VRANGES::distance(__first, __last)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv __size_ += static_cast<__size_type>(_CUDA_VSTD::distance(__first, __last)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } }; @@ -670,9 +670,9 @@ struct __inplace_vector_base<_Tp, 0, __inplace_vector_specialization::__empty> _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& unchecked_emplace_back(_Args&&...) noexcept { _CCCL_UNREACHABLE(); -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) return *begin(); -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) } protected: @@ -800,7 +800,7 @@ public: } } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -851,7 +851,7 @@ public: this->__uninitialized_move(_CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range), this->begin()); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_HIDE_FROM_ABI constexpr inplace_vector& operator=(initializer_list<_Tp> __ilist) { @@ -961,7 +961,7 @@ public: } } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -1037,7 +1037,7 @@ public: this->__uninitialized_copy(__middle, __last, this->end()); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [containers.sequences.inplace.vector.access], element access _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(const size_type __pos) @@ -1307,7 +1307,7 @@ public: return __res; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -1355,7 +1355,7 @@ public: auto __first = _CUDA_VRANGES::begin(__range); insert(this->end(), __first, _CUDA_VRANGES::__unwrap_end(__range)); } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator emplace(const_iterator __cpos, _Args&&... 
__args) @@ -1451,7 +1451,7 @@ public: return _CUDA_VSTD::addressof(this->unchecked_emplace_back(_CUDA_VSTD::move(__value))); } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -1498,7 +1498,7 @@ public: this->__uninitialized_move(__first, __middle, this->end()); return __middle; } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) using __base::unchecked_emplace_back; @@ -1786,7 +1786,7 @@ public: } } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr inplace_vector(from_range_t, _Range&& __range) @@ -1797,7 +1797,7 @@ public: _CUDA_VSTD::__throw_bad_alloc(); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_HIDE_FROM_ABI constexpr inplace_vector& operator=(initializer_list<_Tp> __ilist) { @@ -1838,7 +1838,7 @@ public: return; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr void assign_range(_Range&& __range) @@ -1849,7 +1849,7 @@ public: } return; } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [containers.sequences.inplace.vector.access], element access _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(const size_type __pos) @@ -1996,7 +1996,7 @@ public: return nullptr; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator insert_range(const_iterator __cpos, _Range&& __range) @@ -2017,7 +2017,7 @@ public: _CUDA_VSTD::__throw_bad_alloc(); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator emplace(const_iterator, _Args&&...) 
@@ -2061,14 +2061,14 @@ public: return nullptr; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VRANGES::iterator_t<_Range> try_append_range(_Range&& __range) noexcept { return _CUDA_VRANGES::begin(__range); } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) using __base::unchecked_emplace_back; diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 841aa449c77..238259f45ef 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -48,9 +48,9 @@ # define __cccl_lib_is_null_pointer 201309L # define __cccl_lib_make_reverse_iterator 201402L // # define __cccl_lib_make_unique 201304L -# if !defined(_CCCL_COMPILER_MSVC) || _CCCL_STD_VER >= 2020 +# if !_CCCL_COMPILER(MSVC) || _CCCL_STD_VER >= 2020 # define __cccl_lib_mdspan 202207L -# endif // _CCCL_COMPILER_MSVC && _CCCL_STD_VER >= 2020 +# endif // _CCCL_COMPILER(MSVC) && _CCCL_STD_VER >= 2020 # define __cccl_lib_null_iterators 201304L # define __cccl_lib_optional 202110L // # define __cccl_lib_quoted_string_io 201304L diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp index 976997b99b2..2fc33268dfc 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp @@ -68,7 +68,7 @@ ASSERT_NOEXCEPT(device_empty.function_name()); constexpr cuda::std::source_location cur = cuda::std::source_location::current(); static_assert(cur.line() == 1000, ""); -#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927 +#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _MSC_VER >= 1927 static_assert(cur.column() > 0, ""); #else // ^^^ _CCCL_BULTIN_COLUMN ^^^ / vvv !_CCCL_BULTIN_COLUMN vvv static_assert(cur.column() == 0, ""); @@ -78,7 +78,7 @@ static_assert(cur.file_name()[0] == __FILE__[0] && cur.file_name()[1] == __FILE_ ""); // MSVC below 19.27 is broken with function name -#if !defined(_CCCL_COMPILER_MSVC) || _CCCL_MSVC_VERSION >= 1927 +#if !_CCCL_COMPILER(MSVC) || _MSC_VER >= 1927 static_assert(cur.function_name()[0] == '\0', ""); #else // ^^^ __builtin_FUNCTION ^^^ / vvv !__builtin_FUNCTION vvv static_assert(compare_strings(cur.function_name(), "__builtin_FUNCTION is unsupported")); @@ -139,14 +139,14 @@ __host__ __device__ void test() assert(compare_strings(local.file_name(), __FILE__)); // MSVC below 19.27 is broken with function name -#if !defined(_CCCL_COMPILER_MSVC) || _CCCL_MSVC_VERSION >= 1927 +#if !_CCCL_COMPILER(MSVC) || _MSC_VER >= 1927 assert(find_substring(local.function_name(), "test")); #else // ^^^ __builtin_FUNCTION ^^^ / vvv !__builtin_FUNCTION vvv assert(compare_strings(local.function_name(), "__builtin_FUNCTION is unsupported")); #endif // !__builtin_FUNCTION assert(local.line() == 2000); -#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927 +#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _MSC_VER >= 1927 assert(cur.column() > 0); #else // ^^^ _CCCL_BULTIN_COLUMN ^^^ / vvv 
!_CCCL_BULTIN_COLUMN vvv assert(cur.column() == 0); diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp index db3e3877dc0..25a0d2aff05 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp @@ -67,7 +67,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() int main(int, char**) { test(); -#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !defined(_CCCL_COMPILER_MSVC)) +#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !_CCCL_COMPILER(MSVC)) static_assert(test(), ""); #endif // TEST_STD_VER >= 2014 diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp index 2c22b439ffb..61953443dfd 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp @@ -66,7 +66,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() int main(int, char**) { test(); -#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !defined(_CCCL_COMPILER_MSVC)) +#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !_CCCL_COMPILER(MSVC)) static_assert(test(), ""); #endif // TEST_STD_VER >= 2014 diff --git a/libcudacxx/test/support/test_macros.h b/libcudacxx/test/support/test_macros.h index c42adf2d0bb..c81987a0dc1 100644 --- a/libcudacxx/test/support/test_macros.h +++ b/libcudacxx/test/support/test_macros.h @@ -148,7 +148,7 @@ #endif #if TEST_HAS_BUILTIN(__builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ - || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) + || (_CCCL_COMPILER(MSVC) && _MSC_VER > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) # define TEST_IS_CONSTANT_EVALUATED() _CUDA_VSTD::__libcpp_is_constant_evaluated() #else # define TEST_IS_CONSTANT_EVALUATED() false @@ -245,8 +245,8 @@ #endif #ifndef TEST_HAS_NO_EXCEPTIONS -# if (defined(_CCCL_COMPILER_MSVC) && _HAS_EXCEPTIONS == 0) \ - || (!defined(_CCCL_COMPILER_MSVC) && !__EXCEPTIONS) // Catches all non msvc based compilers +# if (_CCCL_COMPILER(MSVC) && _HAS_EXCEPTIONS == 0) || (!_CCCL_COMPILER(MSVC) && !__EXCEPTIONS) // Catches all non msvc + // based compilers # define TEST_HAS_NO_EXCEPTIONS # endif #endif // !TEST_HAS_NO_EXCEPTIONS diff --git a/thrust/testing/async_sort.cu b/thrust/testing/async_sort.cu index feb5cb5624a..77144779814 100644 --- a/thrust/testing/async_sort.cu +++ b/thrust/testing/async_sort.cu @@ -1,7 +1,7 @@ #include // Disabled on MSVC && NVCC < 11.1 for GH issue #1098. 
-#if defined(_CCCL_COMPILER_MSVC) && defined(__CUDACC__) +#if _CCCL_COMPILER(MSVC) && defined(__CUDACC__) # if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1) # define THRUST_BUG_1098_ACTIVE # endif // NVCC version check diff --git a/thrust/testing/cuda/transform.cu b/thrust/testing/cuda/transform.cu index 888264ffce2..2e474ccfb5a 100644 --- a/thrust/testing/cuda/transform.cu +++ b/thrust/testing/cuda/transform.cu @@ -355,7 +355,7 @@ struct sum_five }; // The following test cannot be compiled because of a bug in the conversion of thrust::tuple on MSVC 2017 -#ifndef _CCCL_COMPILER_MSVC_2017 +#if !_CCCL_COMPILER(MSVC2017) // we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail if the // zip_function is actually called (and not unwrapped) THRUST_NAMESPACE_BEGIN @@ -420,4 +420,4 @@ void TestTransformZipIteratorUnwrapping() } } DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping); -#endif // !_CCCL_COMPILER_MSVC_2017 +#endif // !_CCCL_COMPILER(MSVC2017) diff --git a/thrust/testing/functional.cu b/thrust/testing/functional.cu index 20478dbcb9b..7757ed47bed 100644 --- a/thrust/testing/functional.cu +++ b/thrust/testing/functional.cu @@ -212,7 +212,7 @@ THRUST_DISABLE_BROKEN_GCC_VECTORIZER void TestIdentityFunctional() // value categories when casting to different type static_assert(::cuda::std::is_same{}(3.14)), int&&>::value, ""); // unfortunately, old versions of MSVC pick the `const int&` overload instead of `int&&` -#if defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1929 +#if _CCCL_COMPILER(MSVC, >=, 19, 29) static_assert(::cuda::std::is_same{}(d)), int&&>::value, ""); static_assert(::cuda::std::is_same{}(as_const(d))), int&&>::value, ""); #endif diff --git a/thrust/testing/set_difference.cu b/thrust/testing/set_difference.cu index cdb538d384c..5fe1de1fc1e 100644 --- a/thrust/testing/set_difference.cu +++ b/thrust/testing/set_difference.cu @@ -172,7 +172,7 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceMultiset); // FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration. // That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes. -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void TestSetDifferenceWithBigIndexesHelper(int magnitude) { thrust::counting_iterator begin(0); diff --git a/thrust/testing/set_intersection.cu b/thrust/testing/set_intersection.cu index 392e23b7337..af95e2cdf07 100644 --- a/thrust/testing/set_intersection.cu +++ b/thrust/testing/set_intersection.cu @@ -206,7 +206,7 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset); // FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration. // That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes. 
-#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void TestSetDifferenceWithBigIndexesHelper(int magnitude) { thrust::counting_iterator begin1(0); diff --git a/thrust/testing/vector_manipulation.cu b/thrust/testing/vector_manipulation.cu index e5492eeb1a0..3a10492319c 100644 --- a/thrust/testing/vector_manipulation.cu +++ b/thrust/testing/vector_manipulation.cu @@ -20,7 +20,7 @@ void TestVectorManipulation(size_t n) ASSERT_EQUAL(test1.size(), n); ASSERT_EQUAL((test1 == std::vector(n, T(3))), true); -#if defined(_CCCL_COMPILER_MSVC) && (_MSC_VER <= 1400) +#if _CCCL_COMPILER(MSVC, <=, 14) // XXX MSVC 2005's STL unintentionally uses adl to dispatch advance which // produces an ambiguity between std::advance & thrust::advance // don't produce a KNOWN_FAILURE, just ignore the issue diff --git a/thrust/thrust/detail/config/compiler.h b/thrust/thrust/detail/config/compiler.h index 6b73b04f55e..25d8ebfb29e 100644 --- a/thrust/thrust/detail/config/compiler.h +++ b/thrust/thrust/detail/config/compiler.h @@ -55,13 +55,13 @@ #define THRUST_DEVICE_COMPILER_NVCC 4 // figure out which host compiler we're using -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC //! deprecated [Since 2.7] -# define THRUST_MSVC_VERSION _CCCL_MSVC_VERSION +# define THRUST_MSVC_VERSION _MSC_VER //! deprecated [Since 2.7] -# define THRUST_MSVC_VERSION_FULL _CCCL_MSVC_VERSION_FULL +# define THRUST_MSVC_VERSION_FULL _MSC_FULL_VER #elif _CCCL_COMPILER(ICC) //! deprecated [Since 2.7] # define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_INTEL @@ -91,7 +91,7 @@ #if defined(__CUDACC__) || defined(_NVHPC_CUDA) //! deprecated [Since 2.7] # define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC #elif _CCCL_COMPILER(GCC) diff --git a/thrust/thrust/detail/config/compiler_fence.h b/thrust/thrust/detail/config/compiler_fence.h index fc8cda95682..4b93b682c99 100644 --- a/thrust/thrust/detail/config/compiler_fence.h +++ b/thrust/thrust/detail/config/compiler_fence.h @@ -28,7 +28,7 @@ #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # pragma message( \ "warning: The functionality in this header is unsafe, deprecated, and will soon be removed. Use C++11 atomics instead.") #else @@ -36,7 +36,7 @@ #endif // msvc case -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # ifndef _DEBUG diff --git a/thrust/thrust/detail/config/cpp_dialect.h b/thrust/thrust/detail/config/cpp_dialect.h index e7589b39638..87733b15d55 100644 --- a/thrust/thrust/detail/config/cpp_dialect.h +++ b/thrust/thrust/detail/config/cpp_dialect.h @@ -68,7 +68,7 @@ #define THRUST_CPP_DIALECT _CCCL_STD_VER // Define THRUST_COMPILER_DEPRECATION macro: -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define THRUST_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(message(__FILE__ ":" _CCCL_TO_STRING(__LINE__) ": warning: " #msg)) #else // clang / gcc: # define THRUST_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(GCC warning #msg) @@ -89,10 +89,10 @@ THRUST_COMPILER_DEPRECATION(GCC 5.0); # elif _CCCL_COMPILER(CLANG, <, 7) THRUST_COMPILER_DEPRECATION(Clang 7.0); -# elif defined(_CCCL_COMPILER_MSVC) && THRUST_MSVC_VERSION < 1910 +# elif _CCCL_COMPILER(MSVC, <, 19, 10) // <2017. 
Hard upgrade message: THRUST_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); -# elif defined(_CCCL_COMPILER_MSVC) && THRUST_MSVC_VERSION < 1920 +# elif _CCCL_COMPILER(MSVC2017) // >=2017, <2019. Soft deprecation message: THRUST_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif diff --git a/thrust/thrust/detail/config/deprecated.h b/thrust/thrust/detail/config/deprecated.h index af53047212d..29204f49287 100644 --- a/thrust/thrust/detail/config/deprecated.h +++ b/thrust/thrust/detail/config/deprecated.h @@ -43,7 +43,7 @@ #elif _CCCL_STD_VER >= 2014 # define THRUST_DEPRECATED [[deprecated]] # define THRUST_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define THRUST_DEPRECATED __declspec(deprecated) # define THRUST_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) #elif _CCCL_COMPILER(CLANG) diff --git a/thrust/thrust/iterator/permutation_iterator.h b/thrust/thrust/iterator/permutation_iterator.h index 821a0a2484d..38dd35456b8 100644 --- a/thrust/thrust/iterator/permutation_iterator.h +++ b/thrust/thrust/iterator/permutation_iterator.h @@ -170,10 +170,10 @@ class permutation_iterator : public thrust::detail::permutation_iterator_basebase()); } -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_COMPILER(MSVC2017) // make friends for the copy constructor template diff --git a/thrust/thrust/iterator/reverse_iterator.h b/thrust/thrust/iterator/reverse_iterator.h index 21d258e6d31..a3e6b737b68 100644 --- a/thrust/thrust/iterator/reverse_iterator.h +++ b/thrust/thrust/iterator/reverse_iterator.h @@ -163,11 +163,11 @@ class reverse_iterator : public detail::reverse_iterator_base::type /*! Default constructor does nothing. */ -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) inline _CCCL_HOST_DEVICE zip_iterator() {} -#else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +#else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv zip_iterator() = default; -#endif // !_CCCL_COMPILER_MSVC_2017 +#endif // !_CCCL_COMPILER(MSVC2017) /*! This constructor creates a new \p zip_iterator from a * \p tuple of iterators. 
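For reference, the _CCCL_COMPILER(...) detection macro adopted throughout this patch expands to 0 or 1, so it composes directly in #if expressions with !, && and ||. Three forms appear in the hunks above: the bare compiler name, the dedicated MSVC2017 token, and a comparison against a split major/minor version (the old _MSC_VER < 1938 check becomes _CCCL_COMPILER(MSVC, <, 19, 38)). The sketch below only restates those usage forms; the EXAMPLE_* macro names are hypothetical, and the macro's real definition lives in CCCL's compiler-detection headers rather than in this patch.

// Hedged usage sketch, assuming a CCCL configuration header defining _CCCL_COMPILER
// has already been included; EXAMPLE_* are placeholder names for illustration only.
#if _CCCL_COMPILER(MSVC) // any MSVC host compiler
#  define EXAMPLE_ON_MSVC 1
#endif
#if _CCCL_COMPILER(MSVC, <, 19, 38) // old spelling: _CCCL_MSVC_VERSION < 1938
#  define EXAMPLE_ON_OLD_MSVC 1
#endif
#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // the recurring ranges guard in these hunks
#  define EXAMPLE_RANGES_ENABLED 1
#endif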
diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h
index bb9bf1cfb4b..dbee5ebda24 100644
--- a/thrust/thrust/optional.h
+++ b/thrust/thrust/optional.h
@@ -37,7 +37,7 @@
 #include
 #include
-#if defined(_CCCL_COMPILER_MSVC) && _MSC_VER == 1900
+#if _CCCL_COMPILER(MSVC, ==, 19, 00)
 # define THRUST_OPTIONAL_MSVC2015
 #endif
@@ -231,7 +231,7 @@ using enable_assign_from_other = detail::enable_if_t<
 && !std::is_assignable&>::value && !std::is_assignable&&>::value
 && !std::is_assignable&>::value && !std::is_assignable&&>::value>;
-#if defined(_CCCL_COMPILER_MSVC)
+#if _CCCL_COMPILER(MSVC)
 // TODO make a version which works with MSVC
 template
 struct is_swappable : std::true_type
diff --git a/thrust/thrust/system/detail/error_code.inl b/thrust/thrust/system/detail/error_code.inl
index e27c7db0286..0f41a9da220 100644
--- a/thrust/thrust/system/detail/error_code.inl
+++ b/thrust/thrust/system/detail/error_code.inl
@@ -50,10 +50,10 @@ error_code ::error_code(int val, const error_category& cat)
 template
 error_code ::error_code(ErrorCodeEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>*
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 )
 {
 *this = make_error_code(e);
@@ -67,11 +67,11 @@ void error_code ::assign(int val, const error_category& cat)
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_code>&
 #else
 error_code&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 error_code ::operator=(ErrorCodeEnum e)
 {
 *this = make_error_code(e);
diff --git a/thrust/thrust/system/detail/error_condition.inl b/thrust/thrust/system/detail/error_condition.inl
index a63323be760..f9ad1f2b696 100644
--- a/thrust/thrust/system/detail/error_condition.inl
+++ b/thrust/thrust/system/detail/error_condition.inl
@@ -51,10 +51,10 @@ error_condition ::error_condition(int val, const error_category& cat)
 template
 error_condition ::error_condition(ErrorConditionEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>*
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 )
 {
 *this = make_error_condition(e);
@@ -68,11 +68,11 @@ void error_condition ::assign(int val, const error_category& cat)
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_condition>&
 #else
 error_condition&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 error_condition ::operator=(ErrorConditionEnum e)
 {
 *this = make_error_condition(e);
diff --git a/thrust/thrust/system/error_code.h b/thrust/thrust/system/error_code.h
index d5313db8b2b..1573f8a3a82 100644
--- a/thrust/thrust/system/error_code.h
+++ b/thrust/thrust/system/error_code.h
@@ -256,10 +256,10 @@ class error_code
 template
 error_code(ErrorCodeEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>* = 0
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 );
 // [19.5.2.3] modifiers:
@@ -272,11 +272,11 @@ class error_code
 */
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_code>&
 #else
 error_code&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 operator=(ErrorCodeEnum e);
 /*! \post value() == 0 and category() == system_category().
@@ -367,10 +367,10 @@ class error_condition
 template
 error_condition(ErrorConditionEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>* = 0
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 );
 // [19.5.3.3] modifiers
@@ -391,11 +391,11 @@ class error_condition
 */
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_condition>&
 #else
 error_condition&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 operator=(ErrorConditionEnum e);
 /*! Clears this \p error_code object.
diff --git a/thrust/thrust/type_traits/is_contiguous_iterator.h b/thrust/thrust/type_traits/is_contiguous_iterator.h
index 26ef2020e6b..303b54f38a4 100644
--- a/thrust/thrust/type_traits/is_contiguous_iterator.h
+++ b/thrust/thrust/type_traits/is_contiguous_iterator.h
@@ -39,7 +39,7 @@
 #include
 #include
-#if defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1916 // MSVC 2017 version 15.9
+#if _CCCL_COMPILER(MSVC, <, 19, 16) // MSVC 2017 version 15.9
 # include
 # include
 # include

From cee542b88fb4e943f8ee04c0b1e200ae5fb4bd3b Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Fri, 22 Nov 2024 11:28:08 +0100
Subject: [PATCH 07/45] Reorganize PTX tests to match generator (#2930)

---
 .../cuda/ptx/generated/barrier_cluster.inc | 40 ++
 .../cuda/ptx/generated/cp_async_bulk.inc | 37 ++
 .../generated/cp_async_bulk_commit_group.inc | 10 +
 .../ptx/generated/cp_async_bulk_multicast.inc | 18 +
 .../ptx/generated/cp_async_bulk_tensor.inc | 117 ++++
 .../cp_async_bulk_tensor_multicast.inc | 82 +++
 .../generated/cp_async_bulk_wait_group.inc | 18 +
 .../ptx/generated/cp_reduce_async_bulk.inc | 476 +++++++++++++++
 .../generated/cp_reduce_async_bulk_bf16.inc | 44 ++
 .../generated/cp_reduce_async_bulk_f16.inc | 35 ++
 .../generated/cp_reduce_async_bulk_tensor.inc | 392 ++++++++++++
 .../libcudacxx/cuda/ptx/generated/fence.inc | 38 ++
 .../ptx/generated/fence_mbarrier_init.inc | 11 +
 .../cuda/ptx/generated/fence_proxy_alias.inc | 9 +
 .../cuda/ptx/generated/fence_proxy_async.inc | 24 +
 .../fence_proxy_tensormap_generic.inc | 44 ++
 .../cuda/ptx/generated/get_sreg.inc | 331 +++++++++++
 .../cuda/ptx/generated/getctarank.inc | 10 +
 .../cuda/ptx/generated/mbarrier_arrive.inc | 74 +++
 .../generated/mbarrier_arrive_expect_tx.inc | 31 +
 .../generated/mbarrier_arrive_no_complete.inc | 10 +
 .../cuda/ptx/generated/mbarrier_init.inc | 10 +
 .../cuda/ptx/generated/mbarrier_try_wait.inc | 53 ++
 .../generated/mbarrier_try_wait_parity.inc | 52 ++
 .../cuda/ptx/generated/mbarrier_wait.inc | 24 +
 .../ptx/generated/mbarrier_wait_parity.inc | 24 +
 .../cuda/ptx/generated/red_async.inc | 120 ++++
 .../cuda/ptx/generated/st_async.inc | 35 ++
 .../ptx/generated/tensormap_cp_fenceproxy.inc | 29 +
 .../cuda/ptx/generated/tensormap_replace.inc | 198 +++++++
 .../ptx/ptx.barrier.cluster.compile.pass.cpp | 42 +-
 ...p.async.bulk.commit_group.compile.pass.cpp | 11 +-
 .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 38 +-
 ...x.cp.async.bulk.multicast.compile.pass.cpp | 19 +-
 .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 118 +---
 ...ync.bulk.tensor.multicast.compile.pass.cpp | 83 +--
 ....cp.async.bulk.wait_group.compile.pass.cpp | 19 +-
 .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 560 +-----------------
 ....reduce.async.bulk.tensor.compile.pass.cpp | 393 +-----
.../cuda/ptx/ptx.fence.compile.pass.cpp | 135 +---- .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 332 +---------- .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 11 +- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 120 +--- .../ptx/ptx.mbarrier.init.compile.pass.cpp | 11 +- .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 160 +---- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 121 +--- .../cuda/ptx/ptx.st.async.compile.pass.cpp | 36 +- ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 30 +- .../ptx.tensormap.replace.compile.pass.cpp | 199 +------ 49 files changed, 2427 insertions(+), 2407 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc new file mode 100644 index 00000000000..cad5510ba70 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc @@ 
-0,0 +1,40 @@ +__global__ void test_barrier_cluster(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.release; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.relaxed; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.acquire; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc new file mode 100644 index 00000000000..cd66de989a2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc @@ -0,0 +1,37 @@ +__global__ void test_cp_async_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // + // 1a. unicast + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, + // [rdsmem_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc new file mode 100644 index 00000000000..afdf14abb8a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc @@ -0,0 +1,10 @@ +__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.commit_group; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc new file mode 100644 index 00000000000..b2bd0d968d9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc @@ -0,0 +1,18 @@ +__global__ void test_cp_async_bulk_multicast(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc new file mode 100644 index 00000000000..f9d0d240d28 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc @@ -0,0 +1,117 @@ +__global__ void test_cp_async_bulk_tensor(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1c. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1d. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1e. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..2851aab6d7c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,82 @@ +__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..0139a65f6ce --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,18 @@ +__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // cp.async.bulk.wait_group N; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::cp_async_bulk_wait_group));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // cp.async.bulk.wait_group.read N; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::cp_async_bulk_wait_group_read));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..5ee274bcbe8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,476 @@ +__global__ void test_cp_reduce_async_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [dstMem], [srcMem], size; // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [dstMem], [srcMem], size; // 4. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 6. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..fe38374fe00 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,44 @@ +__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) +{ +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [dstMem], [srcMem], size; // 5. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..e7e58cfcb80 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,35 @@ +__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) +{ +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..6f0a7d710ce --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,392 @@ +__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1c. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc new file mode 100644 index 00000000000..2e464580de9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc @@ -0,0 +1,38 @@ +__global__ void test_fence(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 600 + NV_IF_TARGET( + NV_PROVIDES_SM_70, + ( + // fence.sc.cta; // 1. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence)); + // fence.sc.gpu; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.sc.sys; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.cta; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.gpu; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.sys; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 600 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.sc.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..f503c1d055b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc @@ -0,0 +1,11 @@ +__global__ void test_fence_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.mbarrier_init.release.cluster; // 3. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_mbarrier_init));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..a8021d3f5be --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc @@ -0,0 +1,9 @@ +__global__ void test_fence_proxy_alias(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 750 + NV_IF_TARGET(NV_PROVIDES_SM_70, + ( + // fence.proxy.alias; // 4. + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); +#endif // __cccl_ptx_isa >= 750 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..e3d8e6d160a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc @@ -0,0 +1,24 @@ +__global__ void test_fence_proxy_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.proxy.async; // 5. 
+ * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async.global; // 6. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); + // fence.proxy.async.shared::cluster; // 6. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); + // fence.proxy.async.shared::cta; // 6. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence_proxy_async));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc new file mode 100644 index 00000000000..1e0ea93a387 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc @@ -0,0 +1,44 @@ +__global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.tensormap::generic.release.cta; // 7. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.release.cluster; // 7. + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.release.gpu; // 7. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.release.sys; // 7. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.tensormap::generic.acquire.cta [addr], size; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.acquire.cluster [addr], size; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.acquire.gpu [addr], size; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.acquire.sys [addr], size; // 8. 
+ * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc new file mode 100644 index 00000000000..90842352f90 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc @@ -0,0 +1,331 @@ +__global__ void test_get_sreg(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%laneid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_laneid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%warpid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_warpid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%nwarpid; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nwarpid));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%smid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_smid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%nsmid; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nsmid));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 300 + // mov.u64 sreg_value, %%gridid; + *fn_ptr++ = 
reinterpret_cast(static_cast(cuda::ptx::get_sreg_gridid)); +#endif // __cccl_ptx_isa >= 300 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.pred sreg_value, %%is_explicit_cluster; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_is_explicit_cluster));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctarank; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctarank));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctarank; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctarank));)); +#endif // __cccl_ptx_isa 
>= 780 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_eq; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_eq));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_le; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_le));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_lt; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_lt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_ge; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_ge));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_gt; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_gt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 100 + // mov.u32 sreg_value, %%clock; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock)); +#endif // __cccl_ptx_isa >= 100 + +#if __cccl_ptx_isa >= 500 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%clock_hi; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock_hi));)); +#endif // __cccl_ptx_isa >= 500 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u64 sreg_value, %%clock64; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock64));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u64 sreg_value, %%globaltimer; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%globaltimer_lo; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_lo));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%globaltimer_hi; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_hi));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 410 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%total_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_total_smem_size));)); +#endif // __cccl_ptx_isa >= 410 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%aggr_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_aggr_smem_size));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 410 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%dynamic_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_dynamic_smem_size));)); +#endif // __cccl_ptx_isa >= 410 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // mov.u64 sreg_value, %%current_graph_exec; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_current_graph_exec));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc new file mode 100644 index 00000000000..28b04c9f738 --- 
/dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc @@ -0,0 +1,10 @@ +__global__ void test_getctarank(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // getctarank.shared::cluster.u32 dest, addr; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::getctarank));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..4a94ec51d45 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc @@ -0,0 +1,74 @@ +__global__ void test_mbarrier_arrive(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.shared.b64 state, [addr]; // 1. + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr], count; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr]; // 4a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr], count; // 4b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..085723a452b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,31 @@ +__global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 state, [addr], tx_count; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx)); + // mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 state, [addr], tx_count; // 8. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [addr], tx_count; // 9. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..d1d017cd3c2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,10 @@ +__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc new file mode 100644 index 00000000000..f814161d1f9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc @@ -0,0 +1,10 @@ +__global__ void test_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.init.shared.b64 [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_init));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..e9d8661a07e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc @@ -0,0 +1,53 @@ +__global__ void test_mbarrier_try_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // + // 5b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // + // 6a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 6a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // + // 6b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; + // // 6b. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..f8c3875451a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,52 @@ +__global__ void test_mbarrier_try_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // + // 8a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // + // 8b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, + // suspendTimeHint; // 8b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc new file mode 100644 index 00000000000..80129e5016c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc @@ -0,0 +1,24 @@ +__global__ void test_mbarrier_test_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc new file mode 100644 index 00000000000..30902c58905 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc @@ -0,0 +1,24 @@ +__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc new file mode 100644 index 00000000000..0d562fd31a7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc @@ -0,0 +1,120 @@ +__global__ void test_red_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + // // .u64 intentional + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc new file mode 100644 index 00000000000..4efb95ef217 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc @@ -0,0 +1,35 @@ +__global__ void test_st_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar]; // 1. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::st_async)); + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [addr], value, [remote_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [addr], value, [remote_bar]; // 2. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::st_async)); + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [addr], value, [remote_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; + // // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..9a0a8c1f615 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,29 @@ +__global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], + // size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc new file mode 100644 index 00000000000..c69f3d11964 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc @@ -0,0 +1,198 @@ +__global__ void test_tensormap_replace(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // 
tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // 
tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index e6088d2f317..c460a2e5b09 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -31,48 +31,10 @@ * */ -__global__ void test_barrier_cluster(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.release; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.relaxed; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.acquire; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/barrier_cluster.inc" int main(int, char**) { + // FIXME(bgruber): why no call to test_barrier_cluster? 
return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index b4dff69d5b7..4695221dbc5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.commit_group; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_commit_group.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index b234c35fcdc..b1811727b66 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -31,43 +31,7 @@ * */ -__global__ void test_cp_async_bulk(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // - // 1a. unicast - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, - // [rdsmem_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index 8dbc81741d2..c040528cabc 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -33,24 +33,7 @@ * */ -__global__ void test_cp_async_bulk_multicast(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], - // size, [smem_bar], ctaMask; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_multicast.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 64d9b9590a3..0b69b8a8f1c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -31,123 +31,7 @@ * */ -__global__ void test_cp_async_bulk_tensor(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1c. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1d. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1e. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_tensor.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 2a3457396d0..7d53d9ee0c9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -33,88 +33,7 @@ * */ -__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_tensor_multicast.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 3bfa9bbc7dd..39df53c5f9d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -31,24 +31,7 @@ * */ -__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // cp.async.bulk.wait_group N; - * fn_ptr++ = reinterpret_cast( - static_cast)>(cuda::ptx::cp_async_bulk_wait_group));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // cp.async.bulk.wait_group.read N; - * fn_ptr++ = reinterpret_cast( - static_cast)>(cuda::ptx::cp_async_bulk_wait_group_read));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_wait_group.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index b1d06ca49c0..a186e34a809 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -31,568 +31,14 @@ * */ -__global__ void test_cp_reduce_async_bulk(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [dstMem], [srcMem], size; // 3. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 4. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 6. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_reduce_async_bulk.inc" #ifdef _LIBCUDACXX_HAS_NVF16 -__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 -} - +# include "generated/cp_reduce_async_bulk_f16.inc" #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [dstMem], [srcMem], size; // 5. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 -} - +# include "generated/cp_reduce_async_bulk_bf16.inc" #endif // _LIBCUDACXX_HAS_NVBF16 int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index 5ae7d313c36..14abc0d3ae6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -31,398 +31,7 @@ * */ -__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_reduce_async_bulk_tensor.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 0be4f6b32fe..641cb83f172 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -31,136 +31,11 @@ * */ -__global__ void test_fence(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 600 - NV_IF_TARGET( - NV_PROVIDES_SM_70, - ( - // fence.sc.cta; // 1. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence)); - // fence.sc.gpu; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.sc.sys; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cta; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.gpu; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.sys; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 600 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.sc.cluster; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cluster; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 780 -} - -__global__ void test_fence_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.mbarrier_init.release.cluster; // 3. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_mbarrier_init));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_fence_proxy_alias(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 750 - NV_IF_TARGET(NV_PROVIDES_SM_70, - ( - // fence.proxy.alias; // 4. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); -#endif // __cccl_ptx_isa >= 750 -} - -__global__ void test_fence_proxy_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // fence.proxy.async; // 5. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.async.global; // 6. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); - // fence.proxy.async.shared::cluster; // 6. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); - // fence.proxy.async.shared::cta; // 6. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence_proxy_async));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.tensormap::generic.release.cta; // 7. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.cluster; // 7. - * fn_ptr++ = - reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.gpu; // 7. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.sys; // 7. 
- * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.tensormap::generic.acquire.cta [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.cluster [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.gpu [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.sys [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/fence.inc" +#include "generated/fence_mbarrier_init.inc" +#include "generated/fence_proxy_alias.inc" +#include "generated/fence_proxy_async.inc" +#include "generated/fence_proxy_tensormap_generic.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 0003afb2fe2..697cc00a1be 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -32,337 +32,7 @@ * */ -__global__ void test_get_sreg(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%laneid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_laneid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%warpid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_warpid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%nwarpid; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nwarpid));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.z; - *fn_ptr++ = 
reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%smid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_smid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%nsmid; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nsmid));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 300 - // mov.u64 sreg_value, %%gridid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_gridid)); -#endif // __cccl_ptx_isa >= 300 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.pred sreg_value, %%is_explicit_cluster; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_is_explicit_cluster));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, 
%%cluster_nctaid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctaid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctaid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctarank; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctarank));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctarank; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctarank));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_eq; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_eq));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_le; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_le));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_lt; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_lt));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_ge; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_ge));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_gt; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_gt));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 100 - // mov.u32 sreg_value, %%clock; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock)); -#endif // __cccl_ptx_isa >= 100 - -#if __cccl_ptx_isa >= 500 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%clock_hi; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock_hi));)); -#endif // __cccl_ptx_isa >= 500 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u64 sreg_value, %%clock64; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock64));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u64 sreg_value, %%globaltimer; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%globaltimer_lo; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_lo));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%globaltimer_hi; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_hi));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 410 - NV_IF_TARGET( - NV_PROVIDES_SM_35, 
- ( - // mov.u32 sreg_value, %%total_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_total_smem_size));)); -#endif // __cccl_ptx_isa >= 410 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%aggr_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_aggr_smem_size));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 410 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%dynamic_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_dynamic_smem_size));)); -#endif // __cccl_ptx_isa >= 410 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_50, - ( - // mov.u64 sreg_value, %%current_graph_exec; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_current_graph_exec));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/get_sreg.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index 73112e871b0..80fc71c0998 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_getctarank(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // getctarank.shared::cluster.u32 dest, addr; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::getctarank));)); -#endif // __cccl_ptx_isa >= 780 -} +#include "generated/getctarank.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 3a213d9bce3..2350b176630 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -31,123 +31,9 @@ * */ -__global__ void test_mbarrier_arrive(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET( - NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.shared.b64 state, [addr]; // 1. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive)); - // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr], count; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive)); - // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr]; // 4a. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr], count; // 4b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); -#endif // __cccl_ptx_isa >= 700 -} - -__global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 state, [addr], tx_count; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive_expect_tx)); - // mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 state, [addr], tx_count; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [addr], tx_count; // 9. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive_expect_tx));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/mbarrier_arrive.inc" +#include "generated/mbarrier_arrive_expect_tx.inc" +#include "generated/mbarrier_arrive_no_complete.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index 6aa0f87e41e..b445a61a8a9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.init.shared.b64 [addr], count; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_init));)); -#endif // __cccl_ptx_isa >= 700 -} +#include "generated/mbarrier_init.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index 007ccdef29c..e9c17a2024d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -31,162 +31,10 @@ * */ -__global__ void test_mbarrier_test_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait)); - // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 710 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 710 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity)); - // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_try_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // - // 5b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // - // 6a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait)); - // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 6a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // - // 6b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait)); - // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; - // // 6b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_try_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // - // 8a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity)); - // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // - // 8b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity)); - // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, - // suspendTimeHint; // 8b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/mbarrier_try_wait.inc" +#include "generated/mbarrier_try_wait_parity.inc" +#include "generated/mbarrier_wait.inc" +#include "generated/mbarrier_wait_parity.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index 5a910b77fbd..4a380ec8396 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -31,126 +31,7 @@ * */ -__global__ void test_red_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - 
static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; - // // .u64 intentional - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 -} +#include "generated/red_async.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 1cc0c1e2d74..2c74f48e04d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -31,41 +31,7 @@ * */ -__global__ void test_st_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar]; // 1. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::st_async)); - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [addr], value, [remote_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [addr], value, [remote_bar]; // 2. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::st_async)); - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [addr], value, [remote_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; - // // 3. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 -} +#include "generated/st_async.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index 9d923951f0c..d0d3a967836 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -31,35 +31,7 @@ * */ -__global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], - // size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/tensormap_cp_fenceproxy.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index f7360eacbcd..d780ff26dca 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -31,204 +31,7 @@ * */ -__global__ void test_tensormap_replace(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::tensormap_replace_global_address));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::tensormap_replace_global_address));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::tensormap_replace_rank));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::tensormap_replace_rank));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_box_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - 
( - // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_box_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_global_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_global_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int64_t)>( - cuda::ptx::tensormap_replace_global_stride));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int64_t)>( - cuda::ptx::tensormap_replace_global_stride));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_element_size));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_element_size));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_elemtype));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_elemtype));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_interleave_layout));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_interleave_layout));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_swizzle_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // 
tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_swizzle_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_fill_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_fill_mode));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/tensormap_replace.inc" int main(int, char**) { From ee46f3e8f0f091b449923354ee2a189312b5031a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 22 Nov 2024 12:44:44 +0100 Subject: [PATCH 08/45] Reorganize PTX docs to match generator (#2929) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 32 +- .../ptx/instructions/barrier_cluster.rst | 16 + .../ptx/instructions/cp_async_bulk.rst | 30 ++ ...oup.rst => cp_async_bulk_commit_group.rst} | 8 +- .../ptx/instructions/cp_async_bulk_tensor.rst | 23 ++ .../instructions/cp_async_bulk_wait_group.rst | 9 + .../ptx/instructions/cp_reduce_async_bulk.rst | 61 +++ .../cp_reduce_async_bulk_tensor.rst | 9 + docs/libcudacxx/ptx/instructions/fence.rst | 257 +----------- .../barrier_cluster.rst} | 15 - .../cp_async_bulk.rst} | 43 -- .../generated/cp_async_bulk_commit_group.rst | 7 + .../generated/cp_async_bulk_multicast.rst | 16 + .../cp_async_bulk_tensor.rst} | 105 ----- .../cp_async_bulk_tensor_multicast.rst | 84 ++++ .../cp_async_bulk_wait_group.rst} | 8 - .../cp_reduce_async_bulk.rst} | 164 -------- .../generated/cp_reduce_async_bulk_bf16.rst | 53 +++ .../generated/cp_reduce_async_bulk_f16.rst | 53 +++ .../cp_reduce_async_bulk_tensor.rst} | 8 - .../ptx/instructions/generated/fence.rst | 95 +++++ .../generated/fence_mbarrier_init.rst | 11 + .../generated/fence_proxy_alias.rst | 7 + .../generated/fence_proxy_async.rst | 37 ++ .../fence_proxy_tensormap_generic.rst | 103 +++++ .../ptx/instructions/generated/getctarank.rst | 10 + .../generated/mbarrier_arrive.rst | 111 +++++ .../generated/mbarrier_arrive_expect_tx.rst | 47 +++ .../generated/mbarrier_arrive_no_complete.rst | 9 + .../mbarrier_expect_tx.rst} | 8 - .../mbarrier_init.rst} | 8 - .../generated/mbarrier_test_wait.rst | 37 ++ .../generated/mbarrier_test_wait_parity.rst | 37 ++ .../generated/mbarrier_try_wait.rst | 78 ++++ .../generated/mbarrier_try_wait_parity.rst | 78 ++++ .../red_async.rst} | 30 -- .../generated/special_registers.rst | 383 +++++++++++++++++ .../{st.async.rst => generated/st_async.rst} | 13 - .../tensormap_cp_fenceproxy.rst} | 8 - .../tensormap_replace.rst} | 8 - .../ptx/instructions/getctarank.rst | 11 +- .../ptx/instructions/mbarrier.arrive.rst | 232 ----------- .../ptx/instructions/mbarrier.test_wait.rst | 91 ----- .../ptx/instructions/mbarrier.try_wait.rst | 174 -------- .../ptx/instructions/mbarrier_arrive.rst | 68 ++++ .../ptx/instructions/mbarrier_expect_tx.rst | 9 + .../ptx/instructions/mbarrier_init.rst | 9 + .../ptx/instructions/mbarrier_test_wait.rst | 19 + .../ptx/instructions/mbarrier_try_wait.rst | 20 + .../libcudacxx/ptx/instructions/red_async.rst | 31 ++ .../ptx/instructions/special_registers.rst | 384 +----------------- 
docs/libcudacxx/ptx/instructions/st_async.rst | 14 + .../instructions/tensormap_cp_fenceproxy.rst | 9 + .../ptx/instructions/tensormap_replace.rst | 9 + 54 files changed, 1616 insertions(+), 1583 deletions(-) create mode 100644 docs/libcudacxx/ptx/instructions/barrier_cluster.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_bulk.rst rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.commit_group.rst => cp_async_bulk_commit_group.rst} (58%) create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst rename docs/libcudacxx/ptx/instructions/{barrier.cluster.rst => generated/barrier_cluster.rst} (70%) rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.rst => generated/cp_async_bulk.rst} (57%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.tensor.rst => generated/cp_async_bulk_tensor.rst} (59%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.wait_group.rst => generated/cp_async_bulk_wait_group.rst} (62%) rename docs/libcudacxx/ptx/instructions/{cp.reduce.async.bulk.rst => generated/cp_reduce_async_bulk.rst} (80%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst rename docs/libcudacxx/ptx/instructions/{cp.reduce.async.bulk.tensor.rst => generated/cp_reduce_async_bulk_tensor.rst} (98%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/getctarank.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst rename docs/libcudacxx/ptx/instructions/{mbarrier.expect_tx.rst => generated/mbarrier_expect_tx.rst} (88%) rename docs/libcudacxx/ptx/instructions/{mbarrier.init.rst => generated/mbarrier_init.rst} (50%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst rename docs/libcudacxx/ptx/instructions/{red.async.rst => generated/red_async.rst} (89%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/special_registers.rst rename docs/libcudacxx/ptx/instructions/{st.async.rst => generated/st_async.rst} (83%) rename 
docs/libcudacxx/ptx/instructions/{tensormap.cp_fenceproxy.rst => generated/tensormap_cp_fenceproxy.rst} (89%) rename docs/libcudacxx/ptx/instructions/{tensormap.replace.rst => generated/tensormap_replace.rst} (97%) delete mode 100644 docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst delete mode 100644 docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst delete mode 100644 docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_init.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/red_async.rst create mode 100644 docs/libcudacxx/ptx/instructions/st_async.rst create mode 100644 docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst create mode 100644 docs/libcudacxx/ptx/instructions/tensormap_replace.rst diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index a518dad0ff2..f0776974eec 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -6,25 +6,25 @@ PTX Instructions .. toctree:: :maxdepth: 1 - instructions/barrier.cluster - instructions/cp.async.bulk - instructions/cp.async.bulk.commit_group - instructions/cp.async.bulk.wait_group - instructions/cp.async.bulk.tensor - instructions/cp.reduce.async.bulk - instructions/cp.reduce.async.bulk.tensor + instructions/barrier_cluster + instructions/cp_async_bulk + instructions/cp_async_bulk_commit_group + instructions/cp_async_bulk_wait_group + instructions/cp_async_bulk_tensor + instructions/cp_reduce_async_bulk + instructions/cp_reduce_async_bulk_tensor instructions/fence instructions/getctarank instructions/mapa - instructions/mbarrier.init - instructions/mbarrier.arrive - instructions/mbarrier.expect_tx - instructions/mbarrier.test_wait - instructions/mbarrier.try_wait - instructions/red.async - instructions/st.async - instructions/tensormap.replace - instructions/tensormap.cp_fenceproxy + instructions/mbarrier_init + instructions/mbarrier_arrive + instructions/mbarrier_expect_tx + instructions/mbarrier_test_wait + instructions/mbarrier_try_wait + instructions/red_async + instructions/st_async + instructions/tensormap_replace + instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst new file mode 100644 index 00000000000..bc8943bc619 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst @@ -0,0 +1,16 @@ +.. _libcudacxx-ptx-instructions-barrier-cluster: + +barrier.cluster +=============== + +- PTX ISA: + `barrier.cluster `__ + +Similar functionality is provided through the builtins +``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, +as well as the ``cooperative_groups::cluster_group`` +`API `__. + +The ``.aligned`` variants of the instructions are not exposed. + +.. include:: generated/barrier_cluster.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst new file mode 100644 index 00000000000..32121ef8a12 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst @@ -0,0 +1,30 @@ +.. 
_libcudacxx-ptx-instructions-cp-async-bulk: + +cp.async.bulk +============= + +- PTX ISA: + `cp.async.bulk `__ + +Implementation notes +-------------------- + +**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and +``size`` must be a multiple of 16. + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_multicast`` was enabled for + SM_90. This has been changed to SM_90a. + + +Unicast +------- + +.. include:: generated/cp_async_bulk.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst similarity index 58% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst rename to docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst index cc549f54163..8efc5ac0488 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst @@ -6,10 +6,4 @@ cp.async.bulk.commit_group - PTX ISA: `cp.async.bulk.commit_group `__ -cp.async.bulk.commit_group -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template - __device__ static inline void cp_async_bulk_commit_group(); +.. include:: generated/cp_async_bulk_commit_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst new file mode 100644 index 00000000000..bde3488bac9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst @@ -0,0 +1,23 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: + +cp.async.bulk.tensor +==================== + +- PTX ISA: + `cp.async.bulk.tensor `__ + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled + for SM_90. This has been changed to SM_90a. + +Unicast +------- + +.. include:: generated/cp_async_bulk_tensor.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_tensor_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst new file mode 100644 index 00000000000..e24bb0fc9fd --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: + +cp.async.bulk.wait_group +======================== + +- PTX ISA: + `cp.async.bulk.wait_group `__ + +.. include:: generated/cp_async_bulk_wait_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst new file mode 100644 index 00000000000..a4710b5ce30 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst @@ -0,0 +1,61 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: + +cp.reduce.async.bulk +==================== + +- PTX ISA: + `cp.reduce.async.bulk `__ + + +Integer and floating point instructions +--------------------------------------- + +.. include:: generated/cp_reduce_async_bulk.rst + +Emulation of ``.s64`` instruction +--------------------------------- + +PTX does not currently (CTK 12.3) expose +``cp.reduce.async.bulk.add.s64``. This exposure is emulated in +``cuda::ptx`` using: + +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 + // .dst = { .shared::cluster } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); + + // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); + +FP16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_f16.rst + +BF16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_bf16.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst new file mode 100644 index 00000000000..598d9e1e3ea --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: + +cp.reduce.async.bulk.tensor +=========================== + +- PTX ISA: + `cp.reduce.async.bulk.tensor `__ + +.. include:: generated/cp_reduce_async_bulk_tensor.rst diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst index 8a4e7f281cb..82de170f63b 100644 --- a/docs/libcudacxx/ptx/instructions/fence.rst +++ b/docs/libcudacxx/ptx/instructions/fence.rst @@ -11,272 +11,25 @@ fence fence ----- -fence.sc.cta -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.gpu -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.sys -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.cta -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.gpu -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.sys -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.cluster -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 2. 
PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); - -fence.acq_rel.cluster -^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence.rst fence.mbarrier_init ------------------- -fence.mbarrier_init.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - template - __device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence_mbarrier_init.rst fence.proxy.alias ----------------- -fence.proxy.alias -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template - __device__ static inline void fence_proxy_alias(); +.. include:: generated/fence_proxy_alias.rst fence.proxy.async ----------------- -fence.proxy.async -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template - __device__ static inline void fence_proxy_async(); -fence.proxy.async.global -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); +.. include:: generated/fence_proxy_async.rst fence.proxy.tensormap --------------------- -fence.proxy.tensormap::generic.release.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. 
PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.acquire.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); +.. include:: generated/fence_proxy_tensormap_generic.rst diff --git a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst similarity index 70% rename from docs/libcudacxx/ptx/instructions/barrier.cluster.rst rename to docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index 99048587eb5..bd994990c05 100644 --- a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,18 +1,3 @@ -.. _libcudacxx-ptx-instructions-barrier-cluster: - -barrier.cluster -=============== - -- PTX ISA: - `barrier.cluster `__ - -Similar functionality is provided through the builtins -``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, -as well as the ``cooperative_groups::cluster_group`` -`API `__. - -The ``.aligned`` variants of the instructions are not exposed. - barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst similarity index 57% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 434a44a15a4..f5c236f8bf9 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,26 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk: - -cp.async.bulk -============= - -- PTX ISA: - `cp.async.bulk `__ - -Implementation notes --------------------- - -**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and -``size`` must be a multiple of 16. - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_multicast`` was enabled for - SM_90. This has been changed to SM_90a. - -Unicast -------- - cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -67,23 +44,3 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); - -Multicast ---------- - -cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst new file mode 100644 index 00000000000..984b4aff976 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -0,0 +1,7 @@ +cp.async.bulk.commit_group +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 + template + __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst new file mode 100644 index 00000000000..9cb15d06fa3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -0,0 +1,16 @@ +cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
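For the unicast shared::cluster destination variant kept at the top of the file above, a minimal usage sketch could look like the following; the buffer names are placeholders and, per the note above, both pointers are assumed to be 16-byte aligned with ``bytes`` a multiple of 16.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch: start an asynchronous bulk copy from global memory into cluster shared
   // memory, with completion tracked by the shared-memory mbarrier `smem_bar`.
   __device__ void bulk_load(void* smem_dst, const void* gmem_src,
                             uint32_t bytes, uint64_t* smem_bar)
   {
     cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global,
                              smem_dst, gmem_src, bytes, smem_bar);
   }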
PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst similarity index 59% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index f095abcd1a3..40eb070e66a 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,20 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: - -cp.async.bulk.tensor -==================== - -- PTX ISA: - `cp.async.bulk.tensor `__ - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled - for SM_90. This has been changed to SM_90a. - -Unicast -------- - cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -169,91 +152,3 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group const void* tensorMap, const int32_t (&tensorCoords)[5], const void* srcMem); - -Multicast ---------- - -cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst new file mode 100644 index 00000000000..2481c80bf3c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -0,0 +1,84 @@ +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. 
PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst similarity index 62% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 8939292d340..08ebd3c28a7 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: - -cp.async.bulk.wait_group -======================== - -- PTX ISA: - `cp.async.bulk.wait_group `__ - cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
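All of the multicast tensor copies above follow the same calling pattern; a sketch of the 2-D variant is shown below, assuming ``tmap`` is a valid tensor map in global memory, ``smem_dst`` and ``smem_bar`` live in shared memory, and ``cta_mask`` selects the receiving CTAs of the cluster (SM_90a).

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void multicast_tile(void* smem_dst, const void* tmap, int x, int y,
                                  uint64_t* smem_bar, uint16_t cta_mask)
   {
     // Broadcast one 2-D tile to every CTA selected by cta_mask.
     const int32_t coords[2] = {x, y};
     cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                     smem_dst, tmap, coords, smem_bar, cta_mask);
   }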
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst similarity index 80% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index 571e1d9842f..cc82d633375 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,15 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: - -cp.reduce.async.bulk -==================== - -- PTX ISA: - `cp.reduce.async.bulk `__ - - -Integer and floating point instructions ---------------------------------------- - cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -652,155 +640,3 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 int64_t* dstMem, const int64_t* srcMem, uint32_t size); - -Emulation of ``.s64`` instruction ---------------------------------- - -PTX does not currently (CTK 12.3) expose -``cp.reduce.async.bulk.add.s64``. This exposure is emulated in -``cuda::ptx`` using: - -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 - // .dst = { .shared::cluster } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); - - // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); - -FP16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -BF16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst new file mode 100644 index 00000000000..e4dea98a119 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst new file mode 100644 index 00000000000..18c5e0bfc60 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst similarity index 98% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index 7ea7b5675aa..c653b01cd60 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,11 +1,3 @@ -.. 
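As a rough usage sketch for the ``f16`` reduction listed above, the snippet below accumulates a CTA-shared buffer into global memory and then waits for the bulk group to finish reading the source; the buffer names and the commit/wait pairing are illustrative assumptions.

.. code:: cuda

   #include <cuda/ptx>
   #include <cuda_fp16.h>
   #include <cstdint>

   __device__ void reduce_add_f16(__half* gmem_dst, const __half* smem_src, uint32_t count)
   {
     // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16
     cuda::ptx::cp_reduce_async_bulk(cuda::ptx::space_global, cuda::ptx::space_shared,
                                     cuda::ptx::op_add, gmem_dst, smem_src,
                                     static_cast<uint32_t>(count * sizeof(__half)));
     // Commit the bulk group and wait until the source buffer has been read.
     cuda::ptx::cp_async_bulk_commit_group();
     cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
   }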
_libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: - -cp.reduce.async.bulk.tensor -=========================== - -- PTX ISA: - `cp.reduce.async.bulk.tensor `__ - cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst new file mode 100644 index 00000000000..2fe14dcb3b2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -0,0 +1,95 @@ +fence.sc.cta +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.gpu +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.sys +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); + +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst new file mode 100644 index 00000000000..0f5298e3359 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -0,0 +1,11 @@ +fence.mbarrier_init.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.mbarrier_init.sem.scope; // 3. 
PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + template + __device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst new file mode 100644 index 00000000000..935aab9b6df --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -0,0 +1,7 @@ +fence.proxy.alias +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.alias; // 4. PTX ISA 75, SM_70 + template + __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst new file mode 100644 index 00000000000..3e741a1f6c4 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -0,0 +1,37 @@ +fence.proxy.async +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async; // 5. PTX ISA 80, SM_90 + template + __device__ static inline void fence_proxy_async(); + +fence.proxy.async.global +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst new file mode 100644 index 00000000000..db582971c3d --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -0,0 +1,103 @@ +fence.proxy.tensormap::generic.release.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. 
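The release/acquire pair above is typically wrapped around in-place modification of a tensor map. A hedged sketch, assuming ``tmap`` is a 128-byte tensor map in global memory that one thread has just rewritten through the generic proxy:

.. code:: cuda

   #include <cuda/ptx>

   __device__ void publish_tensormap()
   {
     // Writer side: make the generic-proxy writes visible to the tensormap proxy, GPU-wide.
     cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
   }

   __device__ void acquire_tensormap(const void* tmap)
   {
     // Reader side: acquire the 128-byte tensor map before using it in bulk tensor copies.
     cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_acquire, cuda::ptx::scope_gpu,
                                              tmap, cuda::ptx::n32_t<128>{});
   }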
PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.acquire.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst new file mode 100644 index 00000000000..c85f52ee302 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -0,0 +1,10 @@ +getctarank.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst new file mode 100644 index 00000000000..92cd106cad9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -0,0 +1,111 @@ +mbarrier.arrive.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); + +mbarrier.arrive.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst new file mode 100644 index 00000000000..0087ae2f458 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -0,0 +1,47 @@ +mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst new file mode 100644 index 00000000000..b6d7edbbeee --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -0,0 +1,9 @@ +mbarrier.arrive.noComplete.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst similarity index 88% rename from docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index 9b40db58d0c..b87d6f62a23 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: - -mbarrier.expect_tx -================== - -- PTX ISA: - `mbarrier.expect_tx `__ - mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst similarity index 50% rename from docs/libcudacxx/ptx/instructions/mbarrier.init.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 8c7e65eeab6..3e529d86d78 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-init: - -mbarrier.init -============= - -- PTX ISA: - `mbarrier.arrive `__ - mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
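A typical producer-side use of the transaction-count arrival above looks roughly like the following; ``bar`` is assumed to be an mbarrier in CTA shared memory that has already been initialized.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch: arrive on the barrier and register `bytes` of still-outstanding
   // asynchronous data for the current phase (SM_90).
   __device__ uint64_t arrive_with_tx(uint64_t* bar, uint32_t bytes)
   {
     return cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta,
                                                 cuda::ptx::space_shared, bar, bytes);
   }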
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst new file mode 100644 index 00000000000..4cb241c7ca8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 + template + __device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst new file mode 100644 index 00000000000..e750c4a543f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.parity.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 + template + __device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst new file mode 100644 index 00000000000..ce648c66ee9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst new file mode 100644 index 00000000000..3210dc0eab1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/red.async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/red.async.rst rename to docs/libcudacxx/ptx/instructions/generated/red_async.rst index 62599548a22..d6b9cf36549 100644 --- a/docs/libcudacxx/ptx/instructions/red.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-red-async: - -red.async -========= - -- PTX ISA: - `red.async `__ - -.. _red.async-1: - -red.async ---------- - red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -191,20 +178,3 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 int64_t* dest, const int64_t& value, int64_t* remote_bar); - -red.async ``.s64`` emulation ----------------------------- - -PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This -exposure is emulated in ``cuda::ptx`` using - -.. 
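On the consumer side, the parity-based waits above are usually wrapped in a spin loop. A minimal sketch, assuming ``bar`` lives in CTA shared memory and ``phase`` is the parity bit of the phase being waited on:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void wait_phase(uint64_t* bar, uint32_t phase)
   {
     // Poll until the mbarrier completes the phase with the given parity (SM_90).
     while (!cuda::ptx::mbarrier_try_wait_parity(cuda::ptx::sem_acquire, cuda::ptx::scope_cta,
                                                 bar, phase))
     {
     }
   }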
code:: cuda - - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 - // .op = { .add } - template - __device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst new file mode 100644 index 00000000000..aa1add84781 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst @@ -0,0 +1,383 @@ +tid.x +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_x(); + +tid.y +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_y(); + +tid.z +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_z(); + +ntid.x +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_x(); + +ntid.y +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_y(); + +ntid.z +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_z(); + +laneid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%laneid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_laneid(); + +warpid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%warpid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_warpid(); + +nwarpid +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nwarpid(); + +ctaid.x +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_x(); + +ctaid.y +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_y(); + +ctaid.z +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_z(); + +nctaid.x +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_x(); + +nctaid.y +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_y(); + +nctaid.z +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_z(); + +smid +^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%smid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_smid(); + +nsmid +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nsmid(); + +gridid +^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%gridid; // PTX ISA 30 + template + __device__ static inline uint64_t get_sreg_gridid(); + +is_explicit_cluster +^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 + template + __device__ static inline bool get_sreg_is_explicit_cluster(); + +clusterid.x +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_x(); + +clusterid.y +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_y(); + +clusterid.z +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_z(); + +nclusterid.x +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_x(); + +nclusterid.y +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_y(); + +nclusterid.z +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_z(); + +cluster_ctaid.x +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); + +cluster_ctaid.y +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); + +cluster_ctaid.z +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); + +cluster_nctaid.x +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); + +cluster_nctaid.y +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); + +cluster_nctaid.z +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); + +cluster_ctarank +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctarank(); + +cluster_nctarank +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctarank(); + +lanemask_eq +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_eq(); + +lanemask_le +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_le(); + +lanemask_lt +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_lt(); + +lanemask_ge +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_ge(); + +lanemask_gt +^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_gt(); + +clock +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock; // PTX ISA 10 + template + __device__ static inline uint32_t get_sreg_clock(); + +clock_hi +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 + template + __device__ static inline uint32_t get_sreg_clock_hi(); + +clock64 +^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 + template + __device__ static inline uint64_t get_sreg_clock64(); + +globaltimer +^^^^^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 + template + __device__ static inline uint64_t get_sreg_globaltimer(); + +globaltimer_lo +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_lo(); + +globaltimer_hi +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_hi(); + +total_smem_size +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_total_smem_size(); + +aggr_smem_size +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 + template + __device__ static inline uint32_t get_sreg_aggr_smem_size(); + +dynamic_smem_size +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_dynamic_smem_size(); + +current_graph_exec +^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 + template + __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/st.async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/st.async.rst rename to docs/libcudacxx/ptx/instructions/generated/st_async.rst index a2e1ebe46a6..c519ea57f70 100644 --- a/docs/libcudacxx/ptx/instructions/st.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-st-async: - -st.async -======== - -- PTX ISA: - `st.async `__ -- Used in: :ref:`How to use st.async ` - -**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For -instance, the ``addr`` supplied to the ``v2.b32`` variant must be -aligned to ``2 x 4 = 8`` bytes. - st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 1de158491a8..52fae102ad4 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,11 +1,3 @@ -.. 
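The special-register wrappers above are plain value reads and can be combined freely; for example, a small illustrative logging helper:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdio>

   __device__ void log_position()
   {
     unsigned int lane     = cuda::ptx::get_sreg_laneid();
     unsigned int sm       = cuda::ptx::get_sreg_smid();
     unsigned long long t  = cuda::ptx::get_sreg_clock64();  // SM_35+
     printf("lane %u on SM %u at clock64 %llu\n", lane, sm, t);
   }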
_libcudacxx-ptx-instructions-tensormap-cp_fenceproxy: - -tensormap.cp_fenceproxy -======================= - -- PTX ISA: - `tensormap.cp_fenceproxy `__ - tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst similarity index 97% rename from docs/libcudacxx/ptx/instructions/tensormap.replace.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 7d8b839584e..33e6f1d839a 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-tensormap-replace: - -tensormap.replace -================= - -- PTX ISA: - `tensormap.replace `__ - tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/getctarank.rst b/docs/libcudacxx/ptx/instructions/getctarank.rst index 5bad6259103..d355ed80929 100644 --- a/docs/libcudacxx/ptx/instructions/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/getctarank.rst @@ -6,13 +6,4 @@ getctarank - PTX ISA: `getctarank `__ -getctarank.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline uint32_t getctarank( - cuda::ptx::space_cluster_t, - const void* addr); +.. include:: generated/getctarank.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst deleted file mode 100644 index c383c59c6fd..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst +++ /dev/null @@ -1,232 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-arrive: - -mbarrier.arrive -=============== - -- PTX ISA: - `mbarrier.arrive `__ - -.. _mbarrier.arrive-1: - -mbarrier.arrive ---------------- - -Some of the listed PTX instructions below are semantically equivalent. -They differ in one important way: the shorter instructions are typically -supported on older compilers. - -mbarrier.arrive.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr); - -mbarrier.arrive.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. 
PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.no_complete ---------------------------- - -mbarrier.arrive.noComplete.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.expect_tx -------------------------- - -mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); - -Usage ------ - -.. code:: cuda - - #include - #include - #include - - __global__ void kernel() { - using cuda::ptx::sem_release; - using cuda::ptx::space_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; - - using barrier_t = cuda::barrier; - __shared__ barrier_t bar; - init(&bar, blockDim.x); - __syncthreads(); - - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - // Arrive on local shared memory barrier: - uint64_t token; - token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); - - // Get address of remote cluster barrier: - namespace cg = cooperative_groups; - cg::cluster_group cluster = cg::this_cluster(); - unsigned int other_block_rank = cluster.block_rank() ^ 1; - uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); - - // Sync cluster to ensure remote barrier is initialized. - cluster.sync(); - - // Arrive on remote cluster barrier: - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); - ) - } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst deleted file mode 100644 index 23197e2eb7c..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-test_wait: - -mbarrier.test_wait -================== - -- PTX ISA: - `mbarrier.test_wait `__ - -.. _mbarrier.test_wait-1: - -mbarrier.test_wait ------------------- - -mbarrier.test_wait.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template - __device__ static inline bool mbarrier_test_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.parity -------------------------- - -mbarrier.test_wait.parity.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template - __device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst deleted file mode 100644 index 762f5e100d7..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst +++ /dev/null @@ -1,174 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-try_wait: - -mbarrier.try_wait -================= - -- PTX ISA: - `mbarrier.try_wait `__ - - -.. _mbarrier.try_wait-1: - -mbarrier.try_wait ------------------ - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. 
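For orientation, the untimed ``try_wait`` overload listed above is usually driven from a small polling loop; a minimal sketch follows (illustrative only; it assumes the ``<cuda/ptx>`` header, an already initialized mbarrier in shared memory, and an SM_90 target):

.. code:: cuda

   #include <cuda/ptx>
   #include <cuda/std/cstdint>

   // Arrive on a shared-memory mbarrier, then spin until the phase completes.
   // mbarrier_arrive returns the state token that try_wait tests against.
   __device__ void arrive_and_spin(cuda::std::uint64_t* bar)
   {
     cuda::std::uint64_t state = cuda::ptx::mbarrier_arrive(bar);

     // try_wait may time out and return false, so poll until it succeeds.
     while (!cuda::ptx::mbarrier_try_wait(bar, state))
     {
     }
   }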
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity ------------------------- - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst new file mode 100644 index 00000000000..f01e7a95465 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst @@ -0,0 +1,68 @@ +.. _libcudacxx-ptx-instructions-mbarrier-arrive: + +mbarrier.arrive +=============== + +- PTX ISA: + `mbarrier.arrive `__ + +.. _mbarrier.arrive-1: + +mbarrier.arrive +--------------- + +Some of the listed PTX instructions below are semantically equivalent. +They differ in one important way: the shorter instructions are typically +supported on older compilers. + +.. include:: generated/mbarrier_arrive.rst + +mbarrier.arrive.no_complete +--------------------------- + +.. include:: generated/mbarrier_arrive_no_complete.rst + +mbarrier.arrive.expect_tx +------------------------- + +.. include:: generated/mbarrier_arrive_expect_tx.rst + +Usage +----- + +.. code:: cuda + + #include + #include + #include + + __global__ void kernel() { + using cuda::ptx::sem_release; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; + + using barrier_t = cuda::barrier; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + __syncthreads(); + + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + // Arrive on local shared memory barrier: + uint64_t token; + token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); + + // Get address of remote cluster barrier: + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); + + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Arrive on remote cluster barrier: + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); + ) + } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst new file mode 100644 index 00000000000..6c34813242f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: + +mbarrier.expect_tx +================== + +- PTX ISA: + `mbarrier.expect_tx `__ + +.. include:: generated/mbarrier_expect_tx.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst new file mode 100644 index 00000000000..a736f53b0a2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst @@ -0,0 +1,9 @@ +.. 
_libcudacxx-ptx-instructions-mbarrier-init: + +mbarrier.init +============= + +- PTX ISA: + `mbarrier.arrive `__ + +.. include:: generated/mbarrier_init.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst new file mode 100644 index 00000000000..d8a4e79473e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst @@ -0,0 +1,19 @@ +.. _libcudacxx-ptx-instructions-mbarrier-test_wait: + +mbarrier.test_wait +================== + +- PTX ISA: + `mbarrier.test_wait `__ + +.. _mbarrier.test_wait-1: + +mbarrier.test_wait +------------------ + +.. include:: generated/mbarrier_test_wait.rst + +mbarrier.test_wait.parity +------------------------- + +.. include:: generated/mbarrier_test_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst new file mode 100644 index 00000000000..1869695f3f6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst @@ -0,0 +1,20 @@ +.. _libcudacxx-ptx-instructions-mbarrier-try_wait: + +mbarrier.try_wait +================= + +- PTX ISA: + `mbarrier.try_wait `__ + + +.. _mbarrier.try_wait-1: + +mbarrier.try_wait +----------------- + +.. include:: generated/mbarrier_try_wait.rst + +mbarrier.try_wait.parity +------------------------ + +.. include:: generated/mbarrier_try_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/red_async.rst b/docs/libcudacxx/ptx/instructions/red_async.rst new file mode 100644 index 00000000000..82ba07c38de --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/red_async.rst @@ -0,0 +1,31 @@ +.. _libcudacxx-ptx-instructions-mbarrier-red-async: + +red.async +========= + +- PTX ISA: + `red.async `__ + +.. _red.async-1: + +red.async +--------- + +.. include:: generated/red_async.rst + +red.async ``.s64`` emulation +---------------------------- + +PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This +exposure is emulated in ``cuda::ptx`` using + +.. code:: cuda + + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // .op = { .add } + template + __device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst index 375ce44622e..1e9597fa726 100644 --- a/docs/libcudacxx/ptx/instructions/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/special_registers.rst @@ -6,386 +6,4 @@ Special registers - PTX ISA: `Special Register `__ -tid.x -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_x(); - -tid.y -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_y(); - -tid.z -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_z(); - -ntid.x -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_x(); - -ntid.y -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_y(); - -ntid.z -^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_z(); - -laneid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_laneid(); - -warpid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_warpid(); - -nwarpid -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nwarpid(); - -ctaid.x -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_x(); - -ctaid.y -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_y(); - -ctaid.z -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_z(); - -nctaid.x -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_x(); - -nctaid.y -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_y(); - -nctaid.z -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_z(); - -smid -^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_smid(); - -nsmid -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nsmid(); - -gridid -^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template - __device__ static inline uint64_t get_sreg_gridid(); - -is_explicit_cluster -^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template - __device__ static inline bool get_sreg_is_explicit_cluster(); - -clusterid.x -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_x(); - -clusterid.y -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_y(); - -clusterid.z -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_z(); - -nclusterid.x -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_x(); - -nclusterid.y -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_y(); - -nclusterid.z -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_z(); - -cluster_ctaid.x -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); - -cluster_ctaid.y -^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); - -cluster_ctaid.z -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); - -cluster_nctaid.x -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); - -cluster_nctaid.y -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); - -cluster_nctaid.z -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); - -cluster_ctarank -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctarank(); - -cluster_nctarank -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctarank(); - -lanemask_eq -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_eq(); - -lanemask_le -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_le(); - -lanemask_lt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_lt(); - -lanemask_ge -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_ge(); - -lanemask_gt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_gt(); - -clock -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template - __device__ static inline uint32_t get_sreg_clock(); - -clock_hi -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template - __device__ static inline uint32_t get_sreg_clock_hi(); - -clock64 -^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template - __device__ static inline uint64_t get_sreg_clock64(); - -globaltimer -^^^^^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template - __device__ static inline uint64_t get_sreg_globaltimer(); - -globaltimer_lo -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_lo(); - -globaltimer_hi -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_hi(); - -total_smem_size -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_total_smem_size(); - -aggr_smem_size -^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template - __device__ static inline uint32_t get_sreg_aggr_smem_size(); - -dynamic_smem_size -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_dynamic_smem_size(); - -current_graph_exec -^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template - __device__ static inline uint64_t get_sreg_current_graph_exec(); +.. include:: generated/special_registers.rst diff --git a/docs/libcudacxx/ptx/instructions/st_async.rst b/docs/libcudacxx/ptx/instructions/st_async.rst new file mode 100644 index 00000000000..c71aebd7da3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/st_async.rst @@ -0,0 +1,14 @@ +.. _libcudacxx-ptx-instructions-st-async: + +st.async +======== + +- PTX ISA: + `st.async `__ +- Used in: :ref:`How to use st.async ` + +**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For +instance, the ``addr`` supplied to the ``v2.b32`` variant must be +aligned to ``2 x 4 = 8`` bytes. + +.. include:: generated/st_async.rst diff --git a/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst new file mode 100644 index 00000000000..2f7622bba2c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tensormap-cp_fenceproxy: + +tensormap.cp_fenceproxy +======================= + +- PTX ISA: + `tensormap.cp_fenceproxy `__ + +.. include:: generated/tensormap_cp_fenceproxy.rst diff --git a/docs/libcudacxx/ptx/instructions/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst new file mode 100644 index 00000000000..331dcff313a --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tensormap-replace: + +tensormap.replace +================= + +- PTX ISA: + `tensormap.replace `__ + +.. include:: generated/tensormap_replace.rst From f6ec34b40d69bc42c254de4aab8bda4008857c73 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 22 Nov 2024 13:31:30 +0100 Subject: [PATCH 09/45] Improve build instructions for libcu++ (#2881) * Improve build instructions for libcu++ * Add section about the options for the build script * Delegate more to the contributor guidelines --- .../libcudacxx/setup/building_and_testing.rst | 200 ++---------------- 1 file changed, 21 insertions(+), 179 deletions(-) diff --git a/docs/libcudacxx/setup/building_and_testing.rst b/docs/libcudacxx/setup/building_and_testing.rst index 7a420d0c09a..5b3b010a294 100644 --- a/docs/libcudacxx/setup/building_and_testing.rst +++ b/docs/libcudacxx/setup/building_and_testing.rst @@ -3,200 +3,42 @@ Building & Testing libcu++ ========================== -\*nix Systems, Native Build/Test --------------------------------- +libcu++ can be build and tested as shown in our `contributor guidelines `_. -The procedure is demonstrated for NVCC + GCC in C++11 mode on a -Debian-like Linux systems; the same basic steps are required on all -other platforms. - -Step 0: Install Build Requirements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Bash shell: - -.. 
code:: bash - - # Install LLVM (needed for LLVM's CMake modules) - apt-get -y install llvm - - # Install CMake - apt-get -y install cmake - - # Install the LLVM Integrated Tester (`lit`) - apt-get -y install python-pip - pip install lit - - # Env vars that should be set, or kept in mind for use later - export LIBCUDACXX_ROOT=/path/to/libcudacxx # Git repo root. - -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Bash shell: +However, often only a small subset of the full test suite needs to be run during development. For that we rely on ``lit``. +After libcu++ has been configured either through the build scripts or directly via a cmake preset one can then run. .. code:: bash - cd ${LIBCUDACXX_ROOT} - cmake \ - -S ./ \ - -B build \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_CUDA_COMPILER=$TOOLKIT/bin/nvcc \ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + cd build + lit libcudacxx-cpp17/RELATIVE_PATH_TO_TEST_OR_SUBFOLDER -sv -In a Bash shell: +This will build and run all tests within ``RELATIVE_PATH_TO_TEST_OR_SUBFOLDER`` which must be a valid path within the CCCL. +Note that the name of the top level folder is the same as the name of the preset. For the build script the default is +``libcudacxx-cpp17``. As an example this is how to run all tests for ``cuda::std::span``, which are located in +``libcudacxx/test/libcudacxx/std/containers/views/views.span`` .. code:: bash - cd ${LIBCUDACXX_ROOT}/build # build directory of this repo - ../utils/nvidia/linux/perform_tests.bash --skip-libcxx-tests - -\*nix Systems, Cross Build/Test -------------------------------- - -The procedure is demonstrated for NVCC + GCC cross compiler in C++14 -mode on a Debian-like Linux systems targeting an aarch64 L4T system; the -same basic steps are required on all other platforms. - -Step 0: Install Build Prerequisites -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + cd build -Follow Step 0 for \*nix native builds/tests. + # Builds all tests within libcudacxx/test/libcudacxx/std/containers/views/views.span + lit libcudacxx-cpp17/libcudacxx/test/libcudacxx/std/containers/views/views.span -sv -.. _step-1-generate-the-build-files-1: + # Builds the individual test array.pass.cpp + lit libcudacxx-cpp17/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp -sv -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Bash shell: +If only building the tests and not running them is desired one can pass ``-Dexecutor="NoopExecutor()"`` to the lit invocation. +This is especially usefull if the machine has no GPU or testing a different architecture .. code:: bash - export HOST=executor.nvidia.com - export USERNAME=ubuntu - - cd ${LIBCUDACXX_ROOT} - cmake \ - -S ./ \ - -B build \ - -DCMAKE_CUDA_COMPILER=$TOOLKIT/bin/nvcc \ - -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON \ - -DLIBCXX_EXECUTOR="SSHExecutor(host='${HOST}', username='${USERNAME}')" - -Ensure that you can SSH to the target system from the host system -without inputing a password (e.g. use SSH keys). - -.. _step-2-build-run-the-tests-1: - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Follow Step 2 for \*nix native builds/tests. - -\*nix Systems, NVRTC Build/Test -------------------------------- - -The procedure is demonstrated for NVRTC in C++11 mode on a Debian-like -Linux systems; the same basic steps are required on all other platforms. - -.. 
_step-0-install-build-prerequisites-1: - -Step 0: Install Build Prerequisites -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Follow Step 0 for \*nix native builds/tests. - -.. _step-1-generate-the-build-files-2: - -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + cd build + lit libcudacxx-cpp17/RELATIVE_PATH_TO_TEST_OR_SUBFOLDER -sv -Dexecutor="NoopExecutor()" -In a Bash shell: +Finally different standard modes can be tested by passing e.g ``--param=std=c++20`` .. code:: bash - cd ${LIBCUDACXX_ROOT} - cmake \ - -S ./ \ - -B build \ - -DCMAKE_CXX_COMPILER=$CC \ - -DCMAKE_CUDA_COMPILER=$TOOLKIT/bin/nvcc \ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON \ - -DLIBCUDACXX_TEST_WITH_NVRTC=ON - -.. _step-2-build-run-the-tests-2: - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Follow Step 2 for \*nix native builds/tests. - -Windows, Native Build/Test --------------------------- - -.. _step-0-install-build-requirements-1: - -Step 0: Install Build Requirements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -`Install Python `_. - -Download `the get-pip.py bootstrap -script `_ and run it. - -Install the LLVM Integrated Tester (``lit``) using a Visual Studio -command prompt: - -.. code:: bat - - pip install lit - -Step 0.5: Launching a Build Environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Visual Studio comes with a few build environments that are appropriate -to use. - -The ``x64 Native Tools Command Prompt`` and other similarly named -environments will work. - -If Powershell is desired, it would be best to launch it from within the -native tools. This helps avoid configuration step issues. - -.. _step-1-generate-the-build-files-3: - -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Visual Studio command prompt: - -.. code:: bat - - set LIBCUDACXX_ROOT=\path\to\libcudacxx # Helpful env var pointing to the git repo root. - cd %LIBCUDACXX_ROOT% - - cmake ^ - -S ./ ^ - -B build ^ - -G "Ninja" ^ - -DCMAKE_CXX_COMPILER=cl ^ - -DCMAKE_CUDA_COMPILER=nvcc ^ - -DCMAKE_CUDA_COMPILER_FORCED=ON ^ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON - -.. _step-2-build-run-the-tests-3: - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -``SM_ARCH`` can be set to any integer value (Ex: “80”, “86”) - -.. 
code:: bat - - set LIBCUDACXX_SITE_CONFIG=%LIBCUDACXX_ROOT%\build\test\lit.site.cfg - lit %LIBCUDACXX_ROOT%\test -Dcompute_archs=%SM_ARCH% -sv --no-progress-bar + cd build + lit libcudacxx-cpp17/RELATIVE_PATH_TO_TEST_OR_SUBFOLDER -sv --param=std=c++20 From b27d512d43d9b28505ca8f3f86623640bcea1f8b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 22 Nov 2024 13:37:55 +0100 Subject: [PATCH 10/45] Reorganize PTX headers to match generator (#2925) --- .../cuda/__ptx/instructions/barrier_cluster.h | 124 +- .../cuda/__ptx/instructions/cp_async_bulk.h | 158 +- .../instructions/cp_async_bulk_commit_group.h | 22 +- .../__ptx/instructions/cp_async_bulk_tensor.h | 657 +------ .../instructions/cp_async_bulk_wait_group.h | 46 +- .../__ptx/instructions/cp_reduce_async_bulk.h | 1673 +---------------- .../cp_reduce_async_bulk_tensor.h | 533 +----- .../include/cuda/__ptx/instructions/fence.h | 252 +-- .../generated/barrier_cluster.inc | 123 ++ .../instructions/generated/cp_async_bulk.inc | 111 ++ .../generated/cp_async_bulk_commit_group.inc | 21 + .../generated/cp_async_bulk_multicast.inc | 45 + .../generated/cp_async_bulk_tensor.inc | 416 ++++ .../cp_async_bulk_tensor_multicast.inc | 239 +++ .../generated/cp_async_bulk_wait_group.inc | 45 + .../generated/cp_reduce_async_bulk.inc | 1435 ++++++++++++++ .../generated/cp_reduce_async_bulk_bf16.inc | 127 ++ .../generated/cp_reduce_async_bulk_f16.inc | 110 ++ .../generated/cp_reduce_async_bulk_tensor.inc | 532 ++++++ .../__ptx/instructions/generated/fence.inc | 67 + .../generated/fence_mbarrier_init.inc | 27 + .../generated/fence_proxy_alias.inc | 21 + .../generated/fence_proxy_async.inc | 50 + .../fence_proxy_tensormap_generic.inc | 82 + .../__ptx/instructions/generated/get_sreg.inc | 1001 ++++++++++ .../instructions/generated/getctarank.inc | 27 + .../generated/mbarrier_arrive.inc | 205 ++ .../generated/mbarrier_arrive_expect_tx.inc | 79 + .../generated/mbarrier_arrive_no_complete.inc | 26 + .../instructions/generated/mbarrier_init.inc | 23 + .../generated/mbarrier_test_wait.inc | 75 + .../generated/mbarrier_test_wait_parity.inc | 75 + .../generated/mbarrier_try_wait.inc | 157 ++ .../generated/mbarrier_try_wait_parity.inc | 157 ++ .../instructions/generated/red_async.inc | 417 ++++ .../__ptx/instructions/generated/st_async.inc | 108 ++ .../generated/tensormap_cp_fenceproxy.inc | 54 + .../generated/tensormap_replace.inc | 569 ++++++ .../cuda/__ptx/instructions/get_sreg.h | 1002 +--------- .../cuda/__ptx/instructions/getctarank.h | 28 +- .../cuda/__ptx/instructions/mbarrier_arrive.h | 313 +-- .../cuda/__ptx/instructions/mbarrier_init.h | 24 +- .../cuda/__ptx/instructions/mbarrier_wait.h | 468 +---- .../cuda/__ptx/instructions/red_async.h | 418 +--- .../cuda/__ptx/instructions/st_async.h | 109 +- .../instructions/tensormap_cp_fenceproxy.h | 55 +- .../__ptx/instructions/tensormap_replace.h | 570 +----- 47 files changed, 6454 insertions(+), 6422 deletions(-) create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc create mode 
100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index bc7d88efd48..8b09ddd1110 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,129 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. 
Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -/* -// barrier.cluster.arrive; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .release } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) -{ - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) -{ - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.wait.sem; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void 
barrier_cluster_wait(sem_acquire_t) -{ - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 7acce210230..480a02a701e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,162 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_shared_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. 
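The reorganization above moves the wrapper bodies into ``generated/*.inc`` files; as far as the diff shows, the public entry points keep the signatures of the removed code. As a usage sketch of the cluster-barrier pair exposed by ``barrier_cluster.h`` (illustrative only, not part of this patch; it assumes ``<cuda/ptx>``, the ``sem_release``/``sem_acquire`` tag constants, and an SM_90 cluster launch):

.. code:: cuda

   #include <cuda/ptx>

   // Split cluster synchronization: release our writes, overlap independent
   // work, then acquire before reading other CTAs' shared memory.
   __device__ void cluster_sync_split()
   {
     cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
     // ... work that does not touch remote shared memory ...
     cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
   }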
" - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. " - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. " - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index f0028105350..bd97259cf19 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,27 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. 
Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -/* -// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_commit_group(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_commit_group() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index b66981e8bbb..5b9f575ce5f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,661 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. 
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. 
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 5dcbf8572f4..00a3700e1a9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,51 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -/* -// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group_read( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee89e33c1c2..ee6d90bc4d9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,1679 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_and_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_or_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_xor_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
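// A minimal usage sketch of the bulk-reduce overloads above, assuming an SM_90 target and
// PTX ISA 8.0. The kernel, buffer names and sizes are illustrative assumptions; the commit
// and wait helpers come from the same cuda::ptx namespace.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void reduce_tile_into_global(cuda::std::uint32_t* __restrict__ gmem_acc)
{
  // Shared-memory staging buffer; addresses must be 16-byte aligned and the size a multiple of 16.
  __shared__ alignas(16) cuda::std::uint32_t smem_tile[256];

  // ... all threads of the block fill smem_tile ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .add.u32 reduction of the shared tile into global memory.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      gmem_acc, smem_tile, sizeof(smem_tile));

    // The copy is asynchronous: commit the bulk-async group and wait until the source may be reused.
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}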
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - float* dstMem, - const float* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
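// A similar sketch for the floating-point overloads: the f32 and f64 entry points shown here
// accept only cuda::ptx::op_add. Kernel and buffer names are illustrative assumptions.
#include <cuda/ptx>

__global__ void accumulate_partials(float* __restrict__ gmem_partials)
{
  __shared__ alignas(16) float smem_partials[128];

  // ... the block computes its partial sums into smem_partials ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .add.f32 reduction into the global accumulator.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      gmem_partials, smem_partials, sizeof(smem_partials));
    cuda::ptx::cp_async_bulk_commit_group();
  }
}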
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - double* dstMem, - const double* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - +#include #ifdef _LIBCUDACXX_HAS_NVF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
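// A sketch of the __half overloads guarded by _LIBCUDACXX_HAS_NVF16 below (the __nv_bfloat16
// overloads later in this header follow the same pattern). Hypothetical kernel; for f16 only
// min, max and the .noftz add form are provided.
#include <cuda/ptx>
#include <cuda_fp16.h>

__global__ void min_reduce_halves(__half* __restrict__ gmem_min)
{
  __shared__ alignas(16) __half smem_vals[128];

  // ... the block writes candidate values into smem_vals ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .min.f16 reduction of the shared values into global memory.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min,
      gmem_min, smem_vals, sizeof(smem_vals));
    cuda::ptx::cp_async_bulk_commit_group();
  }
}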
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVF16 - #ifdef _LIBCUDACXX_HAS_NVBF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index 4ecb108a719..a6b23a706c7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,538 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. 
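// The cp_reduce_async_bulk_tensor overloads that follow take a tensor map and a per-dimension
// coordinate array instead of a raw destination pointer. A usage sketch, assuming a CUtensorMap
// created on the host (for example with cuTensorMapEncodeTiled) and passed as a __grid_constant__
// kernel parameter; all names, sizes and the 2-D shape are illustrative assumptions.
#include <cuda.h>
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void reduce_tile_via_tensor_map(const __grid_constant__ CUtensorMap tensor_map)
{
  __shared__ alignas(128) float smem_tile[32][32];

  // ... the block fills smem_tile ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    const cuda::std::int32_t coords[2] = {0, 0}; // tile origin inside the global tensor
    // Element-wise add of the shared tile into the region of the tensor selected by coords.
    cuda::ptx::cp_reduce_async_bulk_tensor(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      &tensor_map, coords, smem_tile);
    cuda::ptx::cp_async_bulk_commit_group();
  }
}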
Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -/* -// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 956f86c910e..045f09cb40e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,253 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -/* -// fence{.sem}.scope; // 1. 
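// A minimal sketch of the fence wrappers declared in this header; the sem_* and scope_*
// arguments are the tag objects from cuda::ptx, and the surrounding device function is an
// illustrative assumption.
#include <cuda/ptx>

__device__ void fence_examples()
{
  // Sequentially consistent fence at CTA scope (fence.sc.cta).
  cuda::ptx::fence(cuda::ptx::sem_sc, cuda::ptx::scope_cta);

  // Acquire-release fence visible GPU-wide (fence.acq_rel.gpu).
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu);

  // Cluster-scope fence, SM_90 and PTX ISA 7.8 or newer (fence.acq_rel.cluster).
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_cluster);
}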
PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } -// .scope = { .cta, .gpu, .sys } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 600 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 600 - -/* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } -// .scope = { .cluster } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 -/* -// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -template -__device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.alias; // 4. 
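// A sketch of the mbarrier and proxy fences in this header, following the signatures documented
// here. The addr argument of the acquire fence is assumed to point at a tensormap in global
// memory; the device function and parameter names are illustrative assumptions.
#include <cuda/ptx>

__device__ void proxy_fence_examples(const void* gmem_tensor_map)
{
  // Make an mbarrier_init in shared memory visible across the cluster before other blocks use it.
  cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);

  // Order generic-proxy writes to shared memory before later async-proxy accesses.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);

  // Publish a tensormap written through the generic proxy, then acquire it before use with
  // bulk tensor instructions (a CUtensorMap is 128 bytes).
  cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
  cuda::ptx::fence_proxy_tensormap_generic(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, gmem_tensor_map, cuda::ptx::n32_t<128>{});
}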
PTX ISA 75, SM_70 -template -__device__ static inline void fence_proxy_alias(); -*/ -#if __cccl_ptx_isa >= 750 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence_proxy_alias() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 750 -/* -// fence.proxy.async; // 5. PTX ISA 80, SM_90 -template -__device__ static inline void fence_proxy_async(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 -// .space = { .global, .shared::cluster, .shared::cta } -template -__device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) -{ - static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
- : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc new file mode 100644 index 00000000000..ca9238bc3ff --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc @@ -0,0 +1,123 @@ +/* +// barrier.cluster.arrive; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .release } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) +{ + // __sem == sem_release (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.release;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) +{ + // __sem == sem_relaxed (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.relaxed;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) +{ + // __sem == sem_acquire (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait.acquire;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc new file mode 100644 index 00000000000..69f77053b95 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc @@ -0,0 +1,111 @@ +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, +SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
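// A sketch combining the cluster barrier wrappers above with the bulk copy into shared memory.
// The kernel is assumed to be launched as a thread-block cluster; mbarrier_init and the
// mbarrier arrive/wait helpers referenced in comments are other cuda::ptx wrappers and are
// assumptions of this example, as are all names and sizes.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void cluster_copy(const float* gmem_src)
{
  __shared__ alignas(16) float smem_buf[256];
  __shared__ cuda::std::uint64_t smem_bar;

  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&smem_bar, 1);
    cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);
  }

  // Cluster-wide rendezvous with release/acquire ordering before issuing cluster-scope traffic.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);

  if (threadIdx.x == 0)
  {
    // Bulk copy from global memory into this block's shared memory; completion is signalled
    // through smem_bar (the "1a. unicast" form above).
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, cuda::ptx::space_global,
      smem_buf, gmem_src, sizeof(smem_buf), &smem_bar);
  }
  // ... a full example would also arrive on smem_bar with the expected transaction count
  // (cuda::ptx::mbarrier_arrive_expect_tx) and wait on it (e.g. cuda::ptx::mbarrier_try_wait_parity)
  // before reading smem_buf ...
}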
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc new file mode 100644 index 00000000000..24baddaea8f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc @@ -0,0 +1,21 @@ +/* +// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_commit_group(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_commit_group() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.commit_group;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc new file mode 100644 index 00000000000..cdd5a535eb6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], +ctaMask; // 1. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " + "%4; // 1. 
" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc new file mode 100644 index 00000000000..547888d5b0f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc @@ -0,0 +1,416 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1a. PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " + "1a." + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1b. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " + "[%4];// 1b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1c. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " + "[%5];// 1c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1d. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5}], [%6];// 1d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1e. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];// 1e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..020698a15b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,239 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2}], [%3], %4; // 2a." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3}], [%4], %5; // 2b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4}], [%5], %6; // 2c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..1a715a0fac6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group_read( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group.read %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..50059ff6c5b --- /dev/null +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,1435 @@ +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_and_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_or_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_xor_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + float* dstMem, + const float* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + double* dstMem, + const double* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..c657e8d1935 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,127 @@ +#ifdef _LIBCUDACXX_HAS_NVBF16 +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 +#endif // _LIBCUDACXX_HAS_NVBF16 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..3a52630db53 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,110 @@ +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..32008f6af5b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,532 @@ +/* +// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " + "1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc new file mode 100644 index 00000000000..f10ec07ebb5 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc @@ -0,0 +1,67 @@ +/* +// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc, .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + asm volatile("fence.sc.gpu; // 1." 
: : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + asm volatile("fence.sc.sys; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc, .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc) { + asm volatile("fence.sc.cluster; // 2." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..0d39c222598 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc @@ -0,0 +1,27 @@ +/* +// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +template +__device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.mbarrier_init.release.cluster; // 3." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..98260b851ca --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc @@ -0,0 +1,21 @@ +/* +// fence.proxy.alias; // 4. 
PTX ISA 75, SM_70 +template +__device__ static inline void fence_proxy_alias(); +*/ +#if __cccl_ptx_isa >= 750 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence_proxy_alias() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + (asm volatile("fence.proxy.alias; // 4." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..f0a37baabdb --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc @@ -0,0 +1,50 @@ +/* +// fence.proxy.async; // 5. PTX ISA 80, SM_90 +template +__device__ static inline void fence_proxy_async(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.proxy.async; // 5." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// .space = { .global, .shared::cluster, .shared::cta } +template +__device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) +{ + static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__space == space_global) { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc new file mode 100644 index 00000000000..3e5b2a265f4 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc @@ -0,0 +1,82 @@ +/* +// fence.proxy.tensormap::generic.release.scope; // 7. 
PTX ISA 83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc new file mode 100644 index 00000000000..dd3079915f7 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc @@ -0,0 +1,1001 @@ +/* +// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%laneid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_laneid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%warpid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_warpid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, 
%%warpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nwarpid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nwarpid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%smid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_smid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, 
%%nsmid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nsmid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nsmid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%gridid; // PTX ISA 30 +template +__device__ static inline uint64_t get_sreg_gridid(); +*/ +#if __cccl_ptx_isa >= 300 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() +{ + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 300 + +/* +// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 +template +__device__ static inline bool get_sreg_is_explicit_cluster(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern 
"C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 
sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" + : "=r"(__sreg_value) + : + :); + return 
__sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_eq(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_le(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_lt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_lanemask_lt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_ge(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_gt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%clock; // PTX ISA 10 +template +__device__ static inline uint32_t get_sreg_clock(); +*/ +#if __cccl_ptx_isa >= 100 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 100 + +/* +// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 +template +__device__ static inline uint32_t get_sreg_clock_hi(); +*/ +#if __cccl_ptx_isa >= 500 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%clock_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 500 + +/* +// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 +template +__device__ static inline uint64_t get_sreg_clock64(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%clock64;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker 
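The lanemask and clock registers are commonly used for warp-level rank computations and coarse timing. A sketch under those assumptions; the helper names are made up for illustration:

#include <cuda/ptx>
#include <cuda/std/cstdint>

// Rank of this lane among the active lanes of its warp: %lanemask_lt has a bit
// set for every lane id smaller than ours, so masking and popcounting gives the
// number of active lanes in front of us.
__device__ cuda::std::uint32_t rank_among_active(cuda::std::uint32_t active_mask)
{
  return __popc(active_mask & cuda::ptx::get_sreg_lanemask_lt());
}

// %clock is a per-SM 32-bit cycle counter; good enough for coarse intra-kernel timing.
__device__ cuda::std::uint32_t cycle_stamp()
{
  return cuda::ptx::get_sreg_clock();
}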
error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 +template +__device__ static inline uint64_t get_sreg_globaltimer(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%globaltimer;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_lo(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_lo;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_hi(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_total_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 +template +__device__ static inline uint32_t get_sreg_aggr_smem_size(); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() +{ + NV_IF_ELSE_TARGET( + 
NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_dynamic_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 +template +__device__ static inline uint64_t get_sreg_current_graph_exec(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_50, + (_CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc new file mode 100644 index 00000000000..51bd351be87 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc @@ -0,0 +1,27 @@ +/* +// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) +{ + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" + : "=r"(__dest) + : "r"(__as_ptr_smem(__addr)) + :); + return __dest;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..f3e2b860d50 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc @@ -0,0 +1,205 @@ +/* +// mbarrier.arrive.shared.b64 state, [addr]; // 1. 
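getctarank maps an address in the shared::cluster window to the rank of the CTA that owns it. A sketch, assuming a cluster launch; the cooperative_groups call is used only to obtain a cluster-mapped pointer and is an assumption outside this patch:

#include <cooperative_groups.h>
#include <cuda/ptx>

namespace cg = cooperative_groups;

__global__ void __cluster_dims__(2, 1, 1) getctarank_demo(unsigned* out)
{
  __shared__ int buf[32];

  // Map our shared buffer into the address window of CTA rank 0 of the cluster,
  // then ask which CTA owns that address; by construction the answer is 0.
  int* peer            = cg::this_cluster().map_shared_rank(buf, 0);
  const unsigned owner = cuda::ptx::getctarank(cuda::ptx::space_cluster, peer);

  if (cuda::ptx::get_sreg_tid_x() == 0 && cuda::ptx::get_sreg_cluster_ctarank() == 0) {
    out[0] = owner; // expected: 0
  }
}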
PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
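A sketch of the basic arrive path on an mbarrier that lives in CTA shared memory. Initialization uses mbarrier_init, which is added further down in this patch, and the wait half of the protocol is shown in a later sketch; the kernel itself is illustrative:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void mbarrier_arrive_demo()
{
  namespace ptx = cuda::ptx;
  __shared__ cuda::std::uint64_t bar;

  if (ptx::get_sreg_tid_x() == 0) {
    // Expect one arrival from every thread of the block.
    ptx::mbarrier_init(&bar, ptx::get_sreg_ntid_x());
  }
  __syncthreads();

  // Form 1 (sm_80 and newer): arrive once and receive an opaque phase token.
  cuda::std::uint64_t token = ptx::mbarrier_arrive(&bar);
  (void) token; // a real kernel would hand this to one of the *_wait functions
}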
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. 
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..efb749957b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,79 @@ +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. 
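mbarrier_arrive_expect_tx combines an arrival with registering how many bytes of asynchronous transactions the current phase should still observe (as used with bulk asynchronous copies). A minimal sketch; the helper name and the assumption that the mbarrier was initialized elsewhere are illustrative:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ cuda::std::uint64_t
post_expected_bytes(cuda::std::uint64_t* bar, cuda::std::uint32_t bytes)
{
  // Arrive once with release semantics at CTA scope and add `bytes` to the
  // expected transaction count of the current phase.
  return cuda::ptx::mbarrier_arrive_expect_tx(
    cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar, bytes);
}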
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..879bedebdc9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,26 @@ +/* +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc new file mode 100644 index 00000000000..3afeeacfccf --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc @@ -0,0 +1,23 @@ +/* +// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 +template +__device__ static inline void mbarrier_init( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm("mbarrier.init.shared.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc new file mode 100644 index 00000000000..301c0364af4 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc @@ -0,0 +1,75 @@ +/* +// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX +ISA 70, SM_80 template +__device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX +ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " + "\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc new file mode 100644 index 00000000000..604cfd92045 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc @@ -0,0 +1,75 @@ +/* +// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX +ISA 71, SM_80 template +__device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 710 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool +mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 710 + +/* +// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX +ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..c5f2062664c --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
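The parity variants wait on an explicit phase-parity bit instead of a token, which the caller tracks and flips after every completed phase. A sketch under that assumption:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void wait_on_parity(cuda::std::uint64_t* bar, cuda::std::uint32_t& parity)
{
  // Poll until the phase with the given parity (0 or 1) has completed.
  while (!cuda::ptx::mbarrier_test_wait_parity(bar, parity)) {
  }
  parity ^= 1; // the next wait targets the other phase
}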
PTX +ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " + "\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. 
+PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..321bfc515da --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool +mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
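try_wait differs from test_wait in that it may suspend the calling thread for a bounded, implementation-defined time before returning, so polling is cheaper. A sketch of the acquire-ordered, CTA-scoped overload; the overload taking an additional suspendTimeHint works the same way:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void wait_acquire_cta(cuda::std::uint64_t* bar, cuda::std::uint64_t token)
{
  namespace ptx = cuda::ptx;
  // Loop until the phase identified by `token` completes, with acquire
  // ordering at CTA scope on success.
  while (!ptx::mbarrier_try_wait(ptx::sem_acquire, ptx::scope_cta, bar, token)) {
  }
}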
+PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
+PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc new file mode 100644 index 00000000000..3157fa1c627 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc @@ -0,0 +1,417 @@ +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int32_t* dest, + 
const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + 
B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc new file mode 100644 index 00000000000..9dfab243ffe --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc @@ -0,0 +1,108 @@ 
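// ---------------------------------------------------------------------------
// Illustrative usage sketch, not part of the generated files above: driving the
// red_async wrappers from red_async.inc. It assumes an SM_90 build, a cluster
// launch, and that cooperative_groups::cluster_group::map_shared_rank (CUDA 12+)
// is used to obtain the peer CTA's shared-memory pointers; those choices are
// assumptions of this example, not prescribed by this patch.
#include <cooperative_groups.h>

#include <cuda/ptx>
#include <cuda/std/cstdint>

namespace cg = cooperative_groups;

__device__ void
bump_peer_counter(cuda::std::uint32_t* __counter_smem, cuda::std::uint64_t* __bar_smem, unsigned __peer_rank)
{
  cg::cluster_group __cluster = cg::this_cluster();

  // Translate our own shared-memory addresses into the peer CTA's window of the
  // cluster's distributed shared memory.
  cuda::std::uint32_t* __remote_counter = __cluster.map_shared_rank(__counter_smem, __peer_rank);
  cuda::std::uint64_t* __remote_bar     = __cluster.map_shared_rank(__bar_smem, __peer_rank);

  // Asynchronous .add.u32 reduction into the remote counter; completion is
  // signalled through the remote mbarrier rather than returned to the caller.
  cuda::ptx::red_async(cuda::ptx::op_add, __remote_counter, 1u, __remote_bar);
}
// ---------------------------------------------------------------------------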
+/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, +SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + Type* addr, + const Type& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, +SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + Type* addr, + const Type (&value)[2], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, +SM_90 template +__device__ static inline void st_async( + B32* addr, + const B32 (&value)[4], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..033d0606e7f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,54 @@ +/* +// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA +83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void tensormap_cp_fenceproxy( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + void* dst, + const void* src, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc new file mode 100644 index 00000000000..3b1060ead38 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc @@ -0,0 +1,569 @@ +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); 
+*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + 
(asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], 
ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 8982984885d..033005beb5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ 
b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,1007 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -/* -// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_laneid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_warpid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%warpid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nwarpid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); 
-template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nwarpid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%smid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_smid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nsmid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nsmid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template -__device__ static inline uint64_t get_sreg_gridid(); -*/ -#if __cccl_ptx_isa >= 300 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() -{ - _CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 300 - -/* -// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template -__device__ static inline bool get_sreg_is_explicit_cluster(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__sreg_value) - : - :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.z;" - : "=r"(__sreg_value) - : - :); - return 
__sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_cluster_ctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template 
-__device__ static inline uint32_t get_sreg_cluster_ctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_eq(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_eq;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_le(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_le;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_lt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_lt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_ge(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_ge;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_gt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_gt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%clock; // PTX ISA 10 -template -__device__ static inline uint32_t get_sreg_clock(); -*/ -#if __cccl_ptx_isa >= 100 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 100 - -/* -// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template -__device__ static inline uint32_t get_sreg_clock_hi(); -*/ -#if __cccl_ptx_isa >= 500 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%clock_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 500 - -/* -// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template -__device__ static inline uint64_t get_sreg_clock64(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%clock64;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template -__device__ static inline uint64_t get_sreg_globaltimer(); -*/ -#if 
__cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%globaltimer;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_lo(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_lo;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_hi(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template -__device__ static inline uint32_t get_sreg_total_smem_size(); -*/ -#if __cccl_ptx_isa >= 410 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%total_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 410 - -/* -// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template -__device__ static inline uint32_t get_sreg_aggr_smem_size(); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%aggr_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template -__device__ static inline uint32_t get_sreg_dynamic_smem_size(); -*/ -#if __cccl_ptx_isa >= 410 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%dynamic_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 410 - -/* -// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template -__device__ static inline uint64_t get_sreg_current_graph_exec(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_50, - (_CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%current_graph_exec;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f1a2bbbd0e9..f5ed3424d3b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,33 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. 
Data Movement and Conversion Instructions: getctarank
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
-/*
-// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90
-// .space = { .shared::cluster }
-template <typename = void>
-__device__ static inline uint32_t getctarank(
-  cuda::ptx::space_cluster_t,
-  const void* addr);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr)
-{
-  // __space == space_cluster (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (_CUDA_VSTD::uint32_t __dest;
-     asm("getctarank.shared::cluster.u32 %0, %1;"
-         : "=r"(__dest)
-         : "r"(__as_ptr_smem(__addr))
-         :);
-     return __dest;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 780
+#include
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h
index 5b423990f1c..fb1341a61d8 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h
@@ -32,316 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
-/*
-// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80
-template <typename = void>
-__device__ static inline uint64_t mbarrier_arrive(
-  uint64_t* addr);
-*/
-#if __cccl_ptx_isa >= 700
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_80,
-    (_CUDA_VSTD::uint64_t __state;
-     asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. "
-         : "=l"(__state)
-         : "r"(__as_ptr_smem(__addr))
-         : "memory");
-     return __state;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 700
-
-/*
-// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90
-template <typename = void>
-__device__ static inline uint64_t mbarrier_arrive(
-  uint64_t* addr,
-  const uint32_t& count);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t
-mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (_CUDA_VSTD::uint64_t __state;
-     asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. "
-         : "=l"(__state)
-         : "r"(__as_ptr_smem(__addr)), "r"(__count)
-         : "memory");
-     return __state;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a.
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_remote_dsmem(__addr)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 700 -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 366b1b67eec..575abda7a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,29 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. 
Parallel Synchronization and Communication Instructions: mbarrier.init
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
-/*
-// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80
-template <typename = void>
-__device__ static inline void mbarrier_init(
-  uint64_t* addr,
-  const uint32_t& count);
-*/
-#if __cccl_ptx_isa >= 700
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();
-template <typename = void>
-_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_80,
-    (asm("mbarrier.init.shared.b64 [%0], %1;"
-         :
-         : "r"(__as_ptr_smem(__addr)), "r"(__count)
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();));
-}
-#endif // __cccl_ptx_isa >= 700
+#include
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h
index 837fec44b9f..2d6adb78eec 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h
@@ -32,470 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
-/*
-// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80
-template <typename = void>
-__device__ static inline bool mbarrier_test_wait(
-  uint64_t* addr,
-  const uint64_t& state);
-*/
-#if __cccl_ptx_isa >= 700
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__();
-template <typename = void>
-_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_80,
-    (_CUDA_VSTD::uint32_t __waitComplete;
-     asm("{\n\t .reg .pred P_OUT; \n\t"
-         "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t"
-         "selp.b32 %0, 1, 0, P_OUT; \n"
-         "}"
-         : "=r"(__waitComplete)
-         : "r"(__as_ptr_smem(__addr)), "l"(__state)
-         : "memory");
-     return static_cast<bool>(__waitComplete);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;));
-}
-#endif // __cccl_ptx_isa >= 700
-
-/*
-// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2.
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template -__device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 710 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline bool -mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); -} -#endif // __cccl_ptx_isa >= 710 - -/* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX -ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint64_t& __state, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool -mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
-PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __phaseParity, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index 777628c67d0..a610cf2b583 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,423 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void red_async( - cuda::ptx::op_inc_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void red_async( - cuda::ptx::op_dec_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, 
SM_90 -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 
-// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } 
-// .op = { .and } -template -__device__ static inline void red_async( - cuda::ptx::op_and_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void red_async( - cuda::ptx::op_or_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void red_async( - cuda::ptx::op_xor_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX 
ISA 81, SM_90 -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint64_t* dest, - const uint64_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 -intentional PTX ISA 81, SM_90 -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) -{ - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index e6774087802..09199b4a3ce 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,114 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type (&value)[2], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template -__device__ static inline void st_async( - B32* addr, - const B32 (&value)[4], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index ce8b0f10991..de179f69735 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,60 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -/* -// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA -83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void tensormap_cp_fenceproxy( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - void* dst, - const void* src, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index b40c0cf72aa..2f81d8b4361 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,575 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_global_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_shared_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_global_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_shared_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, 
_B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" - : - : 
"l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static 
inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - 
(asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX From 4ae70bbbfc2baa942fb499e8d719487544fa9e03 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:44:07 +0100 Subject: [PATCH 11/45] implement C++26 `std::span`'s constructor from `std::initializer_list` (#2923) Co-authored-by: Michael Schellenberger Costa --- .../cuda/std/detail/libcxx/include/span | 33 ++++++- libcudacxx/include/cuda/std/version | 5 +- .../views/views.span/span.cons/array.pass.cpp | 5 +- .../span.cons/initializer_list.pass.cpp | 86 ++++++++++++++++--- 4 files changed, 111 insertions(+), 18 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span index 75774146c09..042d2f029c5 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/span +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span @@ -172,6 +172,7 @@ template #include #include // for ptrdiff_t #include +#include // standard-mandated includes #include @@ -202,6 +203,12 @@ _CCCL_INLINE_VAR constexpr bool __is_std_span> = true; template _CCCL_CONCEPT __span_array_convertible = _CCCL_TRAIT(is_convertible, _From (*)[], _To (*)[]); +template +_CCCL_INLINE_VAR constexpr bool __is_std_initializer_list = false; + +template +_CCCL_INLINE_VAR constexpr bool __is_std_initializer_list> = true; + // We want to ensure that span interacts nicely with containers that might not have had the ranges treatment # if defined(__cpp_lib_ranges) && !_CCCL_COMPILER(MSVC2017) # define _CCCL_SPAN_USES_RANGES @@ -216,7 +223,8 @@ _CCCL_CONCEPT_FRAGMENT( requires(_CUDA_VRANGES::sized_range<_Range>), requires((_CUDA_VRANGES::borrowed_range<_Range> || _CCCL_TRAIT(is_const, _ElementType))), requires((!_CCCL_TRAIT(is_array, remove_cvref_t<_Range>))), - requires((!__is_std_span> && !__is_std_array>) ), + requires((!__is_std_span> && !__is_std_array> + && !__is_std_initializer_list>) ), requires(_CCCL_TRAIT( is_convertible, remove_reference_t<_CUDA_VRANGES::range_reference_t<_Range>> (*)[], _ElementType (*)[])))); @@ -259,11 +267,13 @@ _CCCL_INLINE_VAR constexpr bool __is_span_compatible_container< _ElementType, void_t< // is not a specialization of span - enable_if_t, nullptr_t>, + enable_if_t>, nullptr_t>, + // is not a specialization of array + enable_if_t>, nullptr_t>, // is not a specialization of array - enable_if_t, nullptr_t>, + enable_if_t>, nullptr_t>, // is_array_v is false, - enable_if_t, + enable_if_t), nullptr_t>, // data(cont) and size(cont) are well formed decltype(_CUDA_VSTD::data(_CUDA_VSTD::declval<_Container&>())), decltype(_CUDA_VSTD::size(_CUDA_VSTD::declval<_Container&>())), @@ -329,6 +339,14 @@ public: : __data_{nullptr} {} + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(_CCCL_TRAIT(is_const, _Tp2)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit span(initializer_list __il) noexcept + : __data_{__il.begin()} + { + _CCCL_ASSERT(_Extent == __il.size(), "size mismatch in span's constructor (initializer_list)."); + } + _CCCL_HIDE_FROM_ABI span(const span&) noexcept = default; _CCCL_HIDE_FROM_ABI span& operator=(const span&) noexcept = default; @@ -585,6 +603,13 @@ public: , 
__size_{0} {} + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(_CCCL_TRAIT(is_const, _Tp2)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr span(initializer_list __il) noexcept + : __data_{__il.begin()} + , __size_{__il.size()} + {} + _CCCL_HIDE_FROM_ABI span(const span&) noexcept = default; _CCCL_HIDE_FROM_ABI span& operator=(const span&) noexcept = default; diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 238259f45ef..bb9475ede07 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -59,8 +59,9 @@ # ifndef _LIBCUDACXX_HAS_NO_THREADS // # define __cccl_lib_shared_timed_mutex 201402L # endif // !_LIBCUDACXX_HAS_NO_THREADS -# define __cccl_lib_source_location 201907L -# define __cccl_lib_span 202311L +# define __cccl_lib_source_location 201907L +# define __cccl_lib_span 202311L +# define __cccl_lib_span_initializer_list 202311L // # define __cccl_lib_string_udls 201304L # define __cccl_lib_transformation_trait_aliases 201304L # define __cccl_lib_transparent_operators 201210L diff --git a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp index b0cb864464b..5c819507038 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp @@ -18,6 +18,7 @@ // — remove_pointer_t(*)[] is convertible to ElementType(*)[]. // +#include #include #include @@ -92,8 +93,8 @@ __host__ __device__ constexpr bool testSpan() assert(s3.data() == val && s3.size() == 2); assert(s4.data() == val && s4.size() == 2); - cuda::std::span s5 = {{1, 2}}; - cuda::std::span s6 = {{1, 2}}; + cuda::std::span s5 = {cuda::std::array{1, 2}}; + cuda::std::span s6 = {cuda::std::array{1, 2}}; assert(s5.size() == 2); // and it dangles assert(s6.size() == 2); // and it dangles diff --git a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp index 3c2a2526455..d84d0b01115 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp @@ -1,20 +1,50 @@ //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
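A minimal usage sketch of what the new initializer_list constructor enables (not part of this patch; count_ints, count_two and example are illustrative names):

#include <cuda/std/cassert>
#include <cuda/std/cstddef>
#include <cuda/std/span>

__host__ __device__ constexpr cuda::std::size_t count_ints(cuda::std::span<const int> sp)
{
  return sp.size();
}

__host__ __device__ constexpr cuda::std::size_t count_two(cuda::std::span<const int, 2> sp)
{
  return sp.size();
}

__host__ __device__ void example()
{
  // For spans of const elements, a braced list is now treated as an
  // initializer_list whose backing array the span views for the duration of
  // the full-expression, so this call compiles and yields a three-element span.
  assert(count_ints({1, 2, 3}) == 3);

  // The fixed-extent form uses the same constructor, but there it is explicit
  // and the list size must match the extent.
  assert(count_two(cuda::std::span<const int, 2>{{4, 5}}) == 2);
}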
// //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11 -// +// UNSUPPORTED: c++11 +// + +// constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); + +// #include #include #include +#include #include +#include +#include "test_convertible.h" #include "test_macros.h" + +using cuda::std::is_constructible; + +// Constructor constrains +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); + +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); + +// Constructor conditionally explicit + +static_assert(!test_convertible, cuda::std::initializer_list>(), + "This constructor must be explicit"); +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(test_convertible, cuda::std::initializer_list>(), + "This constructor must not be explicit"); +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); + struct Sink { constexpr Sink() = default; @@ -26,25 +56,61 @@ __host__ __device__ constexpr cuda::std::size_t count(cuda::std::span -__host__ __device__ constexpr cuda::std::size_t countn(cuda::std::span sp) +template +__host__ __device__ constexpr cuda::std::size_t count_n(cuda::std::span sp) { return sp.size(); } __host__ __device__ constexpr bool test() { - Sink a[10] = {}; - assert(count({a}) == 10); - assert(count({a, a + 10}) == 10); - assert(countn<10>({a}) == 10); + // Dynamic extent + { + Sink a[10]{}; + + assert(count({a}) == 1); + assert(count({a, a + 10}) == 2); + assert(count({a, a + 1, a + 2}) == 3); + assert(count(cuda::std::initializer_list{a[0], a[1], a[2], a[3]}) == 4); + } + + return true; +} + +// Test P2447R4 "Annex C examples" + +__host__ __device__ constexpr int three(cuda::std::span sp) +{ + return static_cast(sp.size()); +} + +__host__ __device__ bool test_P2447R4_annex_c_examples() +{ + // 1. Overload resolution is affected + // --> tested in "initializer_list.verify.cpp" + + // 2. The `initializer_list` ctor has high precedence + // --> tested in "initializer_list.verify.cpp" + + // 3. 
Implicit two-argument construction with a highly convertible value_type + { + void* a[10]; + assert(three({a, 0}) == 2); + } + // { + // cuda::std::any a[10]; + // assert(four({a, a + 10}) == 2); + // } + return true; } int main(int, char**) { - test(); + assert(test()); static_assert(test(), ""); + assert(test_P2447R4_annex_c_examples()); + return 0; } From 83d180f487ac85c3985d39835d665ae676ba49b0 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:06:24 +0100 Subject: [PATCH 12/45] Add tuple protocol to `cuda::std::complex` from C++26 (#2882) --- .../include/cuda/std/__complex/nvbf16.h | 32 ++++ .../include/cuda/std/__complex/nvfp16.h | 32 ++++ libcudacxx/include/cuda/std/__fwd/complex.h | 30 +++ libcudacxx/include/cuda/std/__fwd/get.h | 13 ++ .../std/__tuple_dir/structured_bindings.h | 9 + .../include/cuda/std/__tuple_dir/tuple_like.h | 5 + .../cuda/std/__tuple_dir/tuple_like_ext.h | 5 + .../cuda/std/detail/libcxx/include/complex | 74 ++++++++ libcudacxx/include/cuda/std/version | 1 + .../complex.number/complex.tuple/get.pass.cpp | 171 ++++++++++++++++++ .../tuple_element_compiles.pass.cpp | 66 +++++++ .../tuple_size_compiles.pass.cpp | 62 +++++++ 12 files changed, 500 insertions(+) create mode 100644 libcudacxx/include/cuda/std/__fwd/complex.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index b456a53139a..ede7f05a29a 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -30,6 +30,7 @@ _CCCL_DIAG_POP # include # include +# include # include # include # include @@ -112,6 +113,9 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__nv_bfloat162)) compl template friend class complex; + template + friend struct __get_complex_impl; + public: using value_type = __nv_bfloat16; @@ -295,6 +299,34 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<__nv_bfloat16> acos(const complex<__nv_bfloat1 return complex<__nv_bfloat16>{_CUDA_VSTD::acos(complex{__x})}; } +template <> +struct __get_complex_impl<__nv_bfloat16> +{ + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __nv_bfloat16& get(complex<__nv_bfloat16>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __nv_bfloat16&& get(complex<__nv_bfloat16>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__repr_.x : __z.__repr_.y); + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __nv_bfloat16& get(const complex<__nv_bfloat16>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __nv_bfloat16&& get(const complex<__nv_bfloat16>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? 
__z.__repr_.x : __z.__repr_.y); + } +}; + # if !_CCCL_COMPILER(NVRTC) template ::std::basic_istream<_CharT, _Traits>& diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index de8b2538f94..11406f98588 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -27,6 +27,7 @@ # include # include +# include # include # include # include @@ -109,6 +110,9 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__half2)) complex<__ha template friend class complex; + template + friend struct __get_complex_impl; + public: using value_type = __half; @@ -292,6 +296,34 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<__half> acos(const complex<__half>& __x) return complex<__half>{_CUDA_VSTD::acos(complex{__x})}; } +template <> +struct __get_complex_impl<__half> +{ + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __half& get(complex<__half>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __half&& get(complex<__half>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__repr_.x : __z.__repr_.y); + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __half& get(const complex<__half>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __half&& get(const complex<__half>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__repr_.x : __z.__repr_.y); + } +}; + # if !defined(_LIBCUDACXX_HAS_NO_LOCALIZATION) && !_CCCL_COMPILER(NVRTC) template ::std::basic_istream<_CharT, _Traits>& operator>>(::std::basic_istream<_CharT, _Traits>& __is, complex<__half>& __x) diff --git a/libcudacxx/include/cuda/std/__fwd/complex.h b/libcudacxx/include/cuda/std/__fwd/complex.h new file mode 100644 index 00000000000..ba5617380dd --- /dev/null +++ b/libcudacxx/include/cuda/std/__fwd/complex.h @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023-24 NVIDIA CORPORATION & AFFILIATES. 
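A minimal sketch of what the tuple protocol added by this patch enables (not part of the patch itself; example is an illustrative name, and the structured binding assumes C++17 or later):

#include <cuda/std/cassert>
#include <cuda/std/complex>

__host__ __device__ void example()
{
  cuda::std::complex<float> z{3.0f, 4.0f};

  // get<0> yields the real part, get<1> the imaginary part.
  assert(cuda::std::get<0>(z) == 3.0f);
  assert(cuda::std::get<1>(z) == 4.0f);

  // The tuple_size / tuple_element specializations also make complex usable
  // with structured bindings; the references bind to the stored parts.
  auto& [re, im] = z;
  im = 5.0f;
  assert(re == 3.0f && z.imag() == 5.0f);
}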
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___FWD_COMPLEX_H +#define _LIBCUDACXX___FWD_COMPLEX_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class _CCCL_TYPE_VISIBILITY_DEFAULT complex; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___FWD_COMPLEX_H diff --git a/libcudacxx/include/cuda/std/__fwd/get.h b/libcudacxx/include/cuda/std/__fwd/get.h index 6fd977fd158..70607edc813 100644 --- a/libcudacxx/include/cuda/std/__fwd/get.h +++ b/libcudacxx/include/cuda/std/__fwd/get.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,18 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp&& get(array<_Tp, _Size>&&) n template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 const _Tp&& get(const array<_Tp, _Size>&&) noexcept; +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& get(complex<_Tp>&) noexcept; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp&& get(complex<_Tp>&&) noexcept; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp& get(const complex<_Tp>&) noexcept; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp&& get(const complex<_Tp>&&) noexcept; + _LIBCUDACXX_END_NAMESPACE_STD #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) diff --git a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h index e054f78729e..2652536435d 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h @@ -31,6 +31,7 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wmismatched-tags") #endif // !_CCCL_COMPILER(NVRTC) #include +#include #include #include #include @@ -87,6 +88,14 @@ struct tuple_element<_Ip, const volatile _CUDA_VSTD::array<_Tp, _Size>> : _CUDA_VSTD::tuple_element<_Ip, const volatile _CUDA_VSTD::array<_Tp, _Size>> {}; +template +struct tuple_size<_CUDA_VSTD::complex<_Tp>> : _CUDA_VSTD::tuple_size<_CUDA_VSTD::complex<_Tp>> +{}; + +template +struct tuple_element<_Ip, _CUDA_VSTD::complex<_Tp>> : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::complex<_Tp>> +{}; + template struct tuple_size<_CUDA_VSTD::pair<_Tp, _Up>> : _CUDA_VSTD::tuple_size<_CUDA_VSTD::pair<_Tp, _Up>> {}; diff --git a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h index 28a6b1dada9..d9f30347dde 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -56,6 +57,10 @@ template struct __tuple_like> : true_type {}; +template +struct __tuple_like> : true_type +{}; + #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template struct __tuple_like<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : true_type diff --git a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h index 064a3b2787b..8dc56ff460b 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -55,6 
+56,10 @@ template struct __tuple_like_ext> : true_type {}; +template +struct __tuple_like_ext> : true_type +{}; + template struct __tuple_like_ext<__tuple_types<_Tp...>> : true_type {}; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index 68d59129e4e..4e98f7c9774 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -243,6 +243,9 @@ template complex tanh (const complex&); #endif // no system header #include +#include +#include +#include #include #include #include @@ -286,6 +289,9 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_COMPLEX_ALIGNAS complex template friend class complex; + template + friend struct __get_complex_impl; + public: using value_type = _Tp; @@ -1418,6 +1424,74 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> tan(const complex<_Tp>& __x) return complex<_Tp>(__z.imag(), -__z.real()); } +template +struct tuple_size> : _CUDA_VSTD::integral_constant +{}; + +template + struct tuple_element<_Index, complex<_Tp>> : _CUDA_VSTD::enable_if < _Index<2, _Tp> +{}; + +template +struct __get_complex_impl +{ + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& get(complex<_Tp>& __z) noexcept + { + return (_Index == 0) ? __z.__re_ : __z.__im_; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp&& get(complex<_Tp>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__re_ : __z.__im_); + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp& get(const complex<_Tp>& __z) noexcept + { + return (_Index == 0) ? __z.__re_ : __z.__im_; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp&& get(const complex<_Tp>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? 
__z.__re_ : __z.__im_); + } +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& get(complex<_Tp>& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(__z); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp&& get(complex<_Tp>&& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(_CUDA_VSTD::move(__z)); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp& get(const complex<_Tp>& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(__z); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp&& get(const complex<_Tp>&& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(_CUDA_VSTD::move(__z)); +} + #if !_CCCL_COMPILER(NVRTC) template ::std::basic_istream<_CharT, _Traits>& operator>>(::std::basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x) diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index bb9475ede07..0762976d0d9 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -30,6 +30,7 @@ #endif // !_CCCL_COMPILER(NVRTC) #define __cccl_lib_to_underlying 202102L +// #define __cpp_lib_tuple_like 202311L // P2819R2 is implemented, but P2165R4 is not yet #if _CCCL_STD_VER >= 2014 # define __cccl_lib_bit_cast 201806L diff --git a/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp new file mode 100644 index 00000000000..bbf70e671e3 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp @@ -0,0 +1,171 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// + +// template +// constexpr T& get(complex&) noexcept; +// template +// constexpr T&& get(complex&&) noexcept; +// template +// constexpr const T& get(const complex&) noexcept; +// template +// constexpr const T&& get(const complex&&) noexcept; + +#include +#include +// #include +#include + +#include "test_macros.h" + +template +TEST_CONSTEXPR_CXX14 __host__ __device__ void test() +{ + // & + { + cuda::std::complex c{T{27}, T{28}}; + + auto& r = cuda::std::get<0>(c); + ASSERT_SAME_TYPE(T&, decltype(cuda::std::get<0>(c))); + static_assert(noexcept(cuda::std::get<0>(c)), ""); + assert(r == T{27}); + auto& i = cuda::std::get<1>(c); + ASSERT_SAME_TYPE(T&, decltype(cuda::std::get<1>(c))); + static_assert(noexcept(cuda::std::get<1>(c)), ""); + assert(i == T{28}); + } + // && + { + cuda::std::complex c{T{27}, T{28}}; + + auto&& r = cuda::std::get<0>(cuda::std::move(c)); + ASSERT_SAME_TYPE(T&&, decltype(cuda::std::get<0>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<0>(cuda::std::move(c))), ""); + assert(r == T{27}); + } + { + cuda::std::complex c{T{27}, T{28}}; + + auto&& i = cuda::std::get<1>(cuda::std::move(c)); + ASSERT_SAME_TYPE(T&&, decltype(cuda::std::get<1>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<1>(cuda::std::move(c))), ""); + assert(i == T{28}); + } + // const & + { + const cuda::std::complex c{T{27}, T{28}}; + + const auto& r = cuda::std::get<0>(c); + ASSERT_SAME_TYPE(const T&, decltype(cuda::std::get<0>(c))); + static_assert(noexcept(cuda::std::get<0>(c)), ""); + assert(r == T{27}); + const auto& i = cuda::std::get<1>(c); + ASSERT_SAME_TYPE(const T&, decltype(cuda::std::get<1>(c))); + static_assert(noexcept(cuda::std::get<1>(c)), ""); + assert(i == T{28}); + } + // const && + { + const cuda::std::complex c{T{27}, T{28}}; + + const auto&& r = cuda::std::get<0>(cuda::std::move(c)); + ASSERT_SAME_TYPE(const T&&, decltype(cuda::std::get<0>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<0>(cuda::std::move(c))), ""); + assert(r == T{27}); + } + { + const cuda::std::complex c{T{27}, T{28}}; + + const auto&& i = cuda::std::get<1>(cuda::std::move(c)); + ASSERT_SAME_TYPE(const T&&, decltype(cuda::std::get<1>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<1>(cuda::std::move(c))), ""); + assert(i == T{28}); + } + +#if TEST_STD_VER >= 2017 + // `get()` allows using `complex` with structured bindings + { + cuda::std::complex c{T{27}, T{28}}; + + auto [r, i]{c}; + ASSERT_SAME_TYPE(T, decltype(r)); + assert(r == T{27}); + ASSERT_SAME_TYPE(T, decltype(i)); + assert(i == T{28}); + } + { + cuda::std::complex c{T{27}, T{28}}; + + auto& [r, i]{c}; + ASSERT_SAME_TYPE(T, decltype(r)); + assert(r == T{27}); + ASSERT_SAME_TYPE(T, decltype(i)); + assert(i == T{28}); + } +#endif // TEST_STD_VER >= 2017 + + // TODO: Re-enable this test when we have cuda::ranges::views + // #if TEST_STD_VER >= 2017 + // // `get()` allows using `complex` with ranges + // { + // cuda::std::complex arr[]{{T{27}, T{28}}, {T{82}, T{94}}}; + + // auto reals = arr | cuda::std::views::elements<0>; + // ASSERT_SAME_AS(T, cuda::std::ranges::range_value_t); + // assert(cuda::std::ranges::size(reals) == 2); + // assert(cuda::std::ranges::equal(reals, std::array{27, 82})); + + // auto imags = arr | cuda::std::views::elements<0>; + // ASSERT_SAME_AS(T, cuda::std::ranges::range_value_t); + // assert(cuda::std::ranges::size(imags) == 2); + // assert(cuda::std::ranges::equal(imags, 
std::array{28, 94})); + // } + // #endif // TEST_STD_VER >= 2017 +} + +__host__ __device__ bool test() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif + + return true; +} + +TEST_CONSTEXPR_CXX14 __host__ __device__ bool test_constexpr() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + + return true; +} + +int main(int, char**) +{ + test(); + +#if TEST_STD_VER >= 2014 + static_assert(test_constexpr(), ""); +#endif + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp new file mode 100644 index 00000000000..660af111335 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template struct tuple_element; + +#include +#include +#include + +#include "test_macros.h" + +template +struct HasTupleElement : cuda::std::false_type +{}; + +template +struct HasTupleElement{})>> : cuda::std::true_type +{}; + +struct SomeObject +{}; + +static_assert(!HasTupleElement<0, SomeObject>::value, ""); +static_assert(!HasTupleElement<1, SomeObject>::value, ""); +static_assert(!HasTupleElement<3, SomeObject>::value, ""); + +template +__host__ __device__ void test() +{ + using C = cuda::std::complex; + + static_assert(HasTupleElement<0, C>::value, ""); + static_assert(HasTupleElement<1, C>::value, ""); + + ASSERT_SAME_TYPE(T, typename cuda::std::tuple_element<0, C>::type); + ASSERT_SAME_TYPE(T, typename cuda::std::tuple_element<1, C>::type); +} + +__host__ __device__ void test() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp new file mode 100644 index 00000000000..4e34a5c0d64 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// + +// template struct tuple_size; + +#include +#include +#include + +#include "test_macros.h" + +template +struct HasTupleSize : cuda::std::false_type +{}; + +template +struct HasTupleSize{})>> : cuda::std::true_type +{}; + +struct SomeObject +{}; + +static_assert(!HasTupleSize::value, ""); + +template +__host__ __device__ void test() +{ + using C = cuda::std::complex; + + static_assert(HasTupleSize::value, ""); + ASSERT_SAME_TYPE(size_t, typename cuda::std::tuple_size::value_type); + static_assert(cuda::std::tuple_size() == 2, ""); +} + +__host__ __device__ void test() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif +} + +int main(int, char**) +{ + return 0; +} From 537b05013f3e8930cc393f2384cab93c499dd85e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 23 Nov 2024 09:04:49 +0100 Subject: [PATCH 13/45] Add missing qualifier for cuda namespace (#2940) Fixes: #2939 --- cub/cub/device/dispatch/dispatch_histogram.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 15e0311fa2a..9df804d41fd 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -419,7 +419,7 @@ struct dispatch_histogram privatized_decode_op, privatized_decode_op + NUM_ACTIVE_CHANNELS, privatized_decode_op_wrapper.begin()); ::cuda::std::copy(output_decode_op, output_decode_op + NUM_ACTIVE_CHANNELS, output_decode_op_wrapper.begin()); - auto minus_one = cuda::proclaim_return_type([](int levels) { + auto minus_one = ::cuda::proclaim_return_type([](int levels) { return levels - 1; }); ::cuda::std::transform( From c22af18463566a2de4040941deb5895739910f5a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 23 Nov 2024 09:06:44 +0100 Subject: [PATCH 14/45] Try to fix a clang warning: (#2941) agent_histogram.cuh:827:37: warning: comparison of different enumeration types --- cub/cub/agent/agent_histogram.cuh | 46 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index f324de52bce..21a487828ca 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -106,23 +106,19 @@ template struct AgentHistogramPolicy { - enum - { - /// Threads per thread block - BLOCK_THREADS = _BLOCK_THREADS, - - /// Pixels per thread (per tile of input) - PIXELS_PER_THREAD = _PIXELS_PER_THREAD, + /// Threads per thread block + static constexpr int BLOCK_THREADS = _BLOCK_THREADS; + /// Pixels per thread (per tile of input) + static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD; - /// Whether to perform localized RLE to compress samples before histogramming - IS_RLE_COMPRESS = _RLE_COMPRESS, + /// Whether to perform localized RLE to compress samples before histogramming + static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS; - /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - MEM_PREFERENCE = _MEM_PREFERENCE, + /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE; - /// Whether to dequeue tiles from a 
global work queue - IS_WORK_STEALING = _WORK_STEALING, - }; + /// Whether to dequeue tiles from a global work queue + static constexpr bool IS_WORK_STEALING = _WORK_STEALING; /// Vector size for samples loading (1, 2, 4) static constexpr int VEC_SIZE = _VEC_SIZE; @@ -202,23 +198,21 @@ struct AgentHistogram using VecT = typename CubVector::Type; /// Constants - enum - { - BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS; - PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, - SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, - VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize, + static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD; + static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS; + static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize; - TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, - TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS; + static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS; - IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS; - MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM, + static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = + (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM; - IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, - }; + static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; From 80031e29baa11e1674b7d30770badeca0fbdb5dc Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Sun, 24 Nov 2024 10:03:30 -0800 Subject: [PATCH 15/45] minor consistency improvements in concepts macros (#2928) --- .../cuda/std/__concepts/concept_macros.h | 107 +++++++++--------- 1 file changed, 52 insertions(+), 55 deletions(-) diff --git a/libcudacxx/include/cuda/std/__concepts/concept_macros.h b/libcudacxx/include/cuda/std/__concepts/concept_macros.h index 2850c38a493..18587ca57df 100644 --- a/libcudacxx/include/cuda/std/__concepts/concept_macros.h +++ b/libcudacxx/include/cuda/std/__concepts/concept_macros.h @@ -24,6 +24,8 @@ # pragma system_header #endif // no system header +#include // for size_t + //////////////////////////////////////////////////////////////////////////////// // _CCCL_TEMPLATE // Usage: @@ -50,7 +52,7 @@ using __cccl_enable_if_t = typename __cccl_select<_Bp>::template type<_Tp>; template using __cccl_requires_t = typename __cccl_select<_Bp>::template type<_Tp>; -#if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) +#if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_TEMPLATE(...) template <__VA_ARGS__> # define _CCCL_REQUIRES(...) requires __VA_ARGS__ # define _CCCL_AND && @@ -58,43 +60,38 @@ using __cccl_requires_t = typename __cccl_select<_Bp>::template type<_Tp>; # define _CCCL_TRAILING_REQUIRES(...) ->__VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ #else // ^^^ __cpp_concepts ^^^ / vvv !__cpp_concepts vvv # define _CCCL_TEMPLATE(...) template <__VA_ARGS__ -# define _CCCL_REQUIRES(...) 
, bool _CCCL_true_ = true, __cccl_enable_if_t < __VA_ARGS__ && _CCCL_true_, int > = 0 > -# define _CCCL_AND &&_CCCL_true_, int > = 0, __cccl_enable_if_t < +# define _CCCL_REQUIRES(...) , bool __cccl_true_ = true, __cccl_enable_if_t < __VA_ARGS__ && __cccl_true_, int > = 0 > +# define _CCCL_AND &&__cccl_true_, int > = 0, __cccl_enable_if_t < # define _CCCL_TRAILING_REQUIRES_AUX_(...) , __VA_ARGS__ > # define _CCCL_TRAILING_REQUIRES(...) ->__cccl_requires_t < __VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ #endif // !__cpp_concepts #if _CCCL_STD_VER >= 2014 -namespace __cccl_concept -{ - -template -struct _Tag; +template +struct __cccl_tag; template -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_true() +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __cccl_is_true() { return true; } -# if _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) +# if _CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI __cccl_enable_if_t<_Bp> __cccl_requires() {} -# else // ^^^ _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) ^^^ / vvv other compilers vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template = 0> _CCCL_INLINE_VAR constexpr int __cccl_requires = 0; -# endif // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(MSVC) +# endif // !_CCCL_COMPILER(MSVC) template -_LIBCUDACXX_HIDE_FROM_ABI auto __cccl_make_dependent(_Tp*, _Tag<_Args...>*) -> _Tp; +_LIBCUDACXX_HIDE_FROM_ABI auto __cccl_make_dependent(_Tp*, __cccl_tag<_Args...>*) -> _Tp; template -using __requires_expr_impl = - decltype(__cccl_make_dependent(static_cast<_Impl*>(nullptr), static_cast<_Tag*>(nullptr))); - -} // namespace __cccl_concept +using __cccl_requires_expr_impl = + decltype(__cccl_make_dependent(static_cast<_Impl*>(nullptr), static_cast<__cccl_tag*>(nullptr))); // So that we can refer to the ::cuda::std namespace below _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -107,10 +104,10 @@ _LIBCUDACXX_END_NAMESPACE_STD // // where ::concept is a fully qualified name, would not compile. The // _CUDA_VSTD macro is fully qualified. -namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls) +namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls) # if _CCCL_CUDACC_BELOW(12, 2) -# define _CCCL_CONCEPT_VSTD __unqualified_cuda_std // must not be fully qualified +# define _CCCL_CONCEPT_VSTD __cccl_unqualified_cuda_std // must not be fully qualified # else # define _CCCL_CONCEPT_VSTD _CUDA_VSTD # endif @@ -118,10 +115,10 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT_REQS_M0(_REQ) _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_(_REQ)(_REQ) # define _CCCL_CONCEPT_FRAGMENT_REQS_M1(_REQ) _CCCL_PP_EXPAND _REQ # define _CCCL_CONCEPT_FRAGMENT_REQS_(...) 
{_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__)} -# define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_(_REQ) \ - _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_, \ - _CCCL_PP_EVAL(_CCCL_PP_CHECK, _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_, _REQ))) \ - /**/ +# define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_(_REQ) \ + _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_, \ + _CCCL_PP_EVAL(_CCCL_PP_CHECK, _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_, _REQ))) + # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_requires _CCCL_PP_PROBE_N(~, 1) # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_noexcept _CCCL_PP_PROBE_N(~, 2) # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_typename _CCCL_PP_PROBE_N(~, 3) @@ -132,15 +129,14 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_2 _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_OR_NOEXCEPT # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_3 _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_OR_NOEXCEPT # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_4 _CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS + # define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_OR_NOEXCEPT(_REQ) \ _CCCL_PP_CAT4(_CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_, _REQ) # define _CCCL_PP_EAT_TYPENAME_PROBE_typename _CCCL_PP_PROBE(~) # define _CCCL_PP_EAT_TYPENAME_SELECT_(_Xp, ...) \ _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_SELECT_, \ _CCCL_PP_EVAL(_CCCL_PP_CHECK, _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_PROBE_, _Xp))) -# define _CCCL_PP_EAT_TYPENAME_(...) \ - _CCCL_PP_EVAL2(_CCCL_PP_EAT_TYPENAME_SELECT_, __VA_ARGS__, ) \ - (__VA_ARGS__) +# define _CCCL_PP_EAT_TYPENAME_(...) _CCCL_PP_EVAL2(_CCCL_PP_EAT_TYPENAME_SELECT_, __VA_ARGS__, )(__VA_ARGS__) # define _CCCL_PP_EAT_TYPENAME_SELECT_0(...) __VA_ARGS__ # define _CCCL_PP_EAT_TYPENAME_SELECT_1(...) _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_, __VA_ARGS__) # define _CCCL_PP_EAT_TYPENAME_typename @@ -178,13 +174,13 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT(_NAME, ...) \ _LIBCUDACXX_HIDE_FROM_ABI auto _NAME##_CCCL_CONCEPT_FRAGMENT_impl_ _CCCL_CONCEPT_FRAGMENT_REQS_##__VA_ARGS__ > { \ } \ - template \ + template \ _LIBCUDACXX_HIDE_FROM_ABI char _NAME##_CCCL_CONCEPT_FRAGMENT_( \ - __cccl_concept::_Tag<_As...>*, decltype(&_NAME##_CCCL_CONCEPT_FRAGMENT_impl_<_As...>)); \ - _LIBCUDACXX_HIDE_FROM_ABI char(&_NAME##_CCCL_CONCEPT_FRAGMENT_(...))[2] /**/ + ::__cccl_tag<_As...>*, decltype(&_NAME##_CCCL_CONCEPT_FRAGMENT_impl_<_As...>)); \ + _LIBCUDACXX_HIDE_FROM_ABI char(&_NAME##_CCCL_CONCEPT_FRAGMENT_(...))[2] # if defined(_MSC_VER) && !defined(__clang__) # define _CCCL_CONCEPT_FRAGMENT_TRUE(...) \ - __cccl_concept::__is_true() + ::__cccl_is_true() # else # define _CCCL_CONCEPT_FRAGMENT_TRUE(...) \ !(decltype(_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__) void(), false){}) @@ -194,23 +190,22 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT_REQS_M(_REQ) \ _CCCL_PP_CAT2(_CCCL_CONCEPT_FRAGMENT_REQS_M, _CCCL_PP_IS_PAREN(_REQ)) \ (_REQ), -# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_requires(...) __cccl_concept::__cccl_requires<__VA_ARGS__> -# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_typename(...) static_cast<__cccl_concept::_Tag<__VA_ARGS__>*>(nullptr) +# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_requires(...) ::__cccl_requires<__VA_ARGS__> +# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_typename(...) 
static_cast<::__cccl_tag<__VA_ARGS__>*>(nullptr) # if _CCCL_COMPILER(GCC, <, 14) // GCC < 14 can't mangle noexcept expressions, so just check that the // expression is well-formed. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70790 # define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_noexcept(...) __VA_ARGS__ # else -# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_noexcept(...) __cccl_concept::__cccl_requires +# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_noexcept(...) ::__cccl_requires # endif # define _CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS(_REQ) \ - __cccl_concept::__cccl_requires< \ - _CUDA_VSTD::same_as<_CCCL_PP_CAT4(_CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS_, _REQ) _CCCL_PP_RPAREN>> + ::__cccl_requires<_CUDA_VSTD::same_as<_CCCL_PP_CAT4(_CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS_, _REQ) _CCCL_PP_RPAREN>> # define _CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS__Same_as(...) __VA_ARGS__, decltype _CCCL_PP_LPAREN # define _CCCL_FRAGMENT(_NAME, ...) \ - (1u == sizeof(_NAME##_CCCL_CONCEPT_FRAGMENT_(static_cast<__cccl_concept::_Tag<__VA_ARGS__>*>(nullptr), nullptr))) + (1u == sizeof(_NAME##_CCCL_CONCEPT_FRAGMENT_(static_cast<::__cccl_tag<__VA_ARGS__>*>(nullptr), nullptr))) # endif @@ -225,7 +220,7 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls // ); // // Can only be used as the last requirement in a concept definition. -# if defined(__cpp_concepts) && _CCCL_STD_VER >= 2020 +# if defined(__cpp_concepts) && _CCCL_STD_VER >= 2020 || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_REQUIRES_EXPR(_TY, ...) requires(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 # define _CCCL_REQUIRES_EXPR_2(...) {_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__)} # else @@ -249,27 +244,29 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_REQUIRES_EXPR_EXPAND_TPARAMS(...) _CCCL_PP_FOR_EACH(_CCCL_REQUIRES_EXPR_EXPAND_TPARAM, __VA_ARGS__) # define _CCCL_REQUIRES_EXPR(_TY, ...) \ - __cccl_concept::__requires_expr_impl:: \ - __cccl_is_satisfied(static_cast<__cccl_concept::_Tag*>(nullptr), \ - static_cast(nullptr)); \ + ::__cccl_requires_expr_impl< \ + struct _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__) _CCCL_REQUIRES_EXPR_EXPAND_TPARAMS \ + _TY>::__cccl_is_satisfied(static_cast<::__cccl_tag*>(nullptr), \ + static_cast(nullptr)); \ struct _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__) \ { \ - using _Self_t = _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__); \ + using __cccl_self_t = _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__); \ template \ - _LIBCUDACXX_HIDE_FROM_ABI static auto _Well_formed(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 - -# define _CCCL_REQUIRES_EXPR_2(...) \ - ->decltype(_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__) void()) {} \ - template (&_Self_t::_Well_formed))> \ - _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(__cccl_concept::_Tag*, Sig*) \ - { \ - return true; \ - } \ - _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(void*, ...) \ - { \ - return false; \ - } \ + _LIBCUDACXX_HIDE_FROM_ABI static auto __cccl_well_formed(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 + +# define _CCCL_REQUIRES_EXPR_2(...) \ + ->decltype(_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__) void()) {} \ + template (&__cccl_self_t::__cccl_well_formed<_Args...>))> \ + _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(::__cccl_tag<_Args...>*, _Sig*) \ + { \ + return true; \ + } \ + _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(void*, ...) 
\ + { \ + return false; \ + } \ } # endif From 18a014125be329b7b7b848f571c576c71b02bdee Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 11:02:24 +0100 Subject: [PATCH 16/45] Drop some of the mdspan fold implementation (#2949) * Drop unused macros * Do not return a custom struct * Replace `__MDSPAN_FOLD_AND` with `__fold_and_v` when possible --- .../include/cuda/std/__mdspan/extents.h | 20 ++++++------- .../include/cuda/std/__mdspan/layout_left.h | 7 +++-- .../include/cuda/std/__mdspan/layout_right.h | 7 +++-- .../include/cuda/std/__mdspan/layout_stride.h | 8 ++--- libcudacxx/include/cuda/std/__mdspan/macros.h | 30 ++++--------------- .../cuda/std/__mdspan/maybe_static_value.h | 9 ++---- libcudacxx/include/cuda/std/__mdspan/mdspan.h | 29 ++++++++---------- .../include/cuda/std/__mdspan/submdspan.h | 18 +++++------ 8 files changed, 51 insertions(+), 77 deletions(-) diff --git a/libcudacxx/include/cuda/std/__mdspan/extents.h b/libcudacxx/include/cuda/std/__mdspan/extents.h index 302cc26894a..d0bdfd016f6 100644 --- a/libcudacxx/include/cuda/std/__mdspan/extents.h +++ b/libcudacxx/include/cuda/std/__mdspan/extents.h @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -111,8 +112,7 @@ struct __compare_extent_compatible template static integral_constant::value) /* && ... */ - )> _CCCL_HOST_DEVICE + __fold_and_v<(__compare_extent_compatible<_Extents, _OtherExtents>::value)...>> _CCCL_HOST_DEVICE __check_compatible_extents(true_type, _CUDA_VSTD::integer_sequence, _CUDA_VSTD::integer_sequence) noexcept @@ -285,18 +285,16 @@ class extents _CCCL_REQUIRES( // TODO: check whether the other version works with newest NVCC, doesn't with 11.4 // NVCC seems to pick up rank_dynamic from the wrong extents type??? - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) - _CCCL_AND + __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral)...> _CCCL_AND // NVCC chokes on the fold thingy here so wrote the workaround ((sizeof...(_Integral) == __detail::__count_dynamic_extents<_Extents...>::val) || (sizeof...(_Integral) == sizeof...(_Extents)))) # else _CCCL_TEMPLATE(class... _Integral) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) - _CCCL_AND((sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank()))) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral)...> _CCCL_AND( + (sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank()))) # endif _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr extents(_Integral... 
__exts) noexcept # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS @@ -318,8 +316,8 @@ class extents # endif { /* TODO: precondition check - * If sizeof...(_IndexTypes) != rank_dynamic() is true, exts_arr[r] equals Er for each r for which Er is a static - * extent, and either + * If sizeof...(_IndexTypes) != rank_dynamic() is true, exts_arr[r] equals Er for each r for which Er is a + * static extent, and either * - sizeof...(__exts) == 0 is true, or * - each element of __exts is nonnegative and is a representable value of type index_type. */ diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_left.h b/libcudacxx/include/cuda/std/__mdspan/layout_left.h index 8a11107f390..1c105638de7 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_left.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_left.h @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -187,9 +188,9 @@ class layout_left::mapping //-------------------------------------------------------------------------------- _CCCL_TEMPLATE(class... _Indices) - _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _CCCL_AND __MDSPAN_FOLD_AND( - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) + _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type)...> // + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)...>) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { // Immediately cast incoming indices to `index_type` diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_right.h b/libcudacxx/include/cuda/std/__mdspan/layout_right.h index bd61461ab82..43a1df74b30 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_right.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_right.h @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -192,9 +193,9 @@ class layout_right::mapping //-------------------------------------------------------------------------------- _CCCL_TEMPLATE(class... _Indices) - _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _CCCL_AND __MDSPAN_FOLD_AND( - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) + _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type)...> // + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)...>) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast(__idxs)...); diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h index 3f31820cf49..520ded0f222 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h @@ -61,6 +61,7 @@ # include #endif // _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS #include +#include #include #include #include @@ -425,10 +426,9 @@ struct layout_stride } _CCCL_TEMPLATE(class... 
_Indices) - _CCCL_REQUIRES( - (sizeof...(_Indices) == _Extents::rank()) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _Indices, index_type) /*&& ...*/) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _Indices) /*&& ...*/)) + _CCCL_REQUIRES((sizeof...(_Indices) == _Extents::rank()) + _CCCL_AND __fold_and_v<_CCCL_TRAIT(is_convertible, _Indices, index_type)...> // + _CCCL_AND __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _Indices)...>) __MDSPAN_FORCE_INLINE_FUNCTION constexpr index_type operator()(_Indices... __idxs) const noexcept { diff --git a/libcudacxx/include/cuda/std/__mdspan/macros.h b/libcudacxx/include/cuda/std/__mdspan/macros.h index d3dc04b1111..b9b56adae37 100644 --- a/libcudacxx/include/cuda/std/__mdspan/macros.h +++ b/libcudacxx/include/cuda/std/__mdspan/macros.h @@ -276,18 +276,12 @@ //============================================================================== // {{{1 -struct __mdspan_enable_fold_comma -{}; - # ifdef __MDSPAN_USE_FOLD_EXPRESSIONS -# define __MDSPAN_FOLD_AND(...) ((__VA_ARGS__) && ...) -# define __MDSPAN_FOLD_AND_TEMPLATE(...) ((__VA_ARGS__) && ...) -# define __MDSPAN_FOLD_OR(...) ((__VA_ARGS__) || ...) -# define __MDSPAN_FOLD_ASSIGN_LEFT(__INIT, ...) (__INIT = ... = (__VA_ARGS__)) -# define __MDSPAN_FOLD_ASSIGN_RIGHT(__PACK, ...) (__PACK = ... = (__VA_ARGS__)) -# define __MDSPAN_FOLD_TIMES_RIGHT(__PACK, ...) (__PACK * ... * (__VA_ARGS__)) -# define __MDSPAN_FOLD_PLUS_RIGHT(__PACK, ...) (__PACK + ... + (__VA_ARGS__)) -# define __MDSPAN_FOLD_COMMA(...) ((__VA_ARGS__), ...) +# define __MDSPAN_FOLD_AND(...) ((__VA_ARGS__) && ...) +# define __MDSPAN_FOLD_OR(...) ((__VA_ARGS__) || ...) +# define __MDSPAN_FOLD_ASSIGN_LEFT(__INIT, ...) (__INIT = ... = (__VA_ARGS__)) +# define __MDSPAN_FOLD_TIMES_RIGHT(__PACK, ...) (__PACK * ... * (__VA_ARGS__)) +# define __MDSPAN_FOLD_PLUS_RIGHT(__PACK, ...) (__PACK + ... + (__VA_ARGS__)) # else _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -601,12 +595,6 @@ __fold_left_assign_impl(_Args&&... __args) # endif -template -_CCCL_HOST_DEVICE constexpr __mdspan_enable_fold_comma __fold_comma_impl(_Args&&...) noexcept -{ - return {}; -} - template struct __bools; @@ -618,18 +606,10 @@ _LIBCUDACXX_END_NAMESPACE_STD # define __MDSPAN_FOLD_OR(...) _CUDA_VSTD::__fold_compatibility_impl::__fold_right_or_impl((__VA_ARGS__)...) # define __MDSPAN_FOLD_ASSIGN_LEFT(__INIT, ...) \ _CUDA_VSTD::__fold_compatibility_impl::__fold_left_assign_impl(__INIT, (__VA_ARGS__)...) -# define __MDSPAN_FOLD_ASSIGN_RIGHT(__PACK, ...) \ - _CUDA_VSTD::__fold_compatibility_impl::__fold_right_assign_impl((__PACK)..., __VA_ARGS__) # define __MDSPAN_FOLD_TIMES_RIGHT(__PACK, ...) \ _CUDA_VSTD::__fold_compatibility_impl::__fold_right_times_impl((__PACK)..., __VA_ARGS__) # define __MDSPAN_FOLD_PLUS_RIGHT(__PACK, ...) \ _CUDA_VSTD::__fold_compatibility_impl::__fold_right_plus_impl((__PACK)..., __VA_ARGS__) -# define __MDSPAN_FOLD_COMMA(...) _CUDA_VSTD::__fold_compatibility_impl::__fold_comma_impl((__VA_ARGS__)...) - -# define __MDSPAN_FOLD_AND_TEMPLATE(...) 
\ - _CCCL_TRAIT(_CUDA_VSTD::is_same, \ - __fold_compatibility_impl::__bools<(__VA_ARGS__)..., true>, \ - __fold_compatibility_impl::__bools) # endif diff --git a/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h b/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h index 0a8d2696b9f..fd978c6c3f8 100644 --- a/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h +++ b/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h @@ -88,10 +88,9 @@ struct __maybe_static_value return static_cast<_dynamic_t>(__v); } template - __MDSPAN_FORCE_INLINE_FUNCTION constexpr __mdspan_enable_fold_comma __set_value(_Up&& /*__rhs*/) noexcept + __MDSPAN_FORCE_INLINE_FUNCTION constexpr void __set_value(_Up&& /*__rhs*/) noexcept { // Should we assert that the value matches the static value here? - return {}; } //-------------------------------------------------------------------------- @@ -132,10 +131,9 @@ struct __maybe_static_value<_dynamic_t, _static_t, __is_dynamic_sentinal, __is_d return __v; } template - __MDSPAN_FORCE_INLINE_FUNCTION constexpr __mdspan_enable_fold_comma __set_value(_Up&& __rhs) noexcept + __MDSPAN_FORCE_INLINE_FUNCTION constexpr void __set_value(_Up&& __rhs) noexcept { __v = (_Up&&) rhs; - return {}; } # else __MDSPAN_FORCE_INLINE_FUNCTION constexpr _dynamic_t __value() const noexcept @@ -147,10 +145,9 @@ struct __maybe_static_value<_dynamic_t, _static_t, __is_dynamic_sentinal, __is_d return this->__no_unique_address_emulation<_dynamic_t>::__ref(); } template - __MDSPAN_FORCE_INLINE_FUNCTION constexpr __mdspan_enable_fold_comma __set_value(_Up&& __rhs) noexcept + __MDSPAN_FORCE_INLINE_FUNCTION constexpr void __set_value(_Up&& __rhs) noexcept { this->__no_unique_address_emulation<_dynamic_t>::__ref() = (_Up&&) __rhs; - return {}; } # endif }; diff --git a/libcudacxx/include/cuda/std/__mdspan/mdspan.h b/libcudacxx/include/cuda/std/__mdspan/mdspan.h index ad359c555a2..b206a35fd55 100644 --- a/libcudacxx/include/cuda/std/__mdspan/mdspan.h +++ b/libcudacxx/include/cuda/std/__mdspan/mdspan.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -177,12 +178,11 @@ class mdspan _CCCL_HIDE_FROM_ABI constexpr mdspan(mdspan&&) = default; _CCCL_TEMPLATE(class... _SizeTypes) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - _CCCL_AND((sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) - _CCCL_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) - _CCCL_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_convertible, _SizeTypes, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes)...> _CCCL_AND( + (sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) + _CCCL_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _CCCL_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr mdspan(data_handle_type __p, _SizeTypes... __dynamic_extents) // TODO @proposal-bug shouldn't I be allowed to do `move(__p)` here? : __members( @@ -264,10 +264,9 @@ class mdspan # if __MDSPAN_USE_BRACKET_OPERATOR _CCCL_TEMPLATE(class... _SizeTypes) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... 
*/) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - _CCCL_AND(rank() == sizeof...(_SizeTypes))) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_convertible, _SizeTypes, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes)...> _CCCL_AND( + rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_SizeTypes... __indices) const { @@ -307,10 +306,9 @@ class mdspan # if __MDSPAN_USE_PAREN_OPERATOR _CCCL_TEMPLATE(class... _SizeTypes) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - _CCCL_AND(extents_type::rank() == sizeof...(_SizeTypes))) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_convertible, _SizeTypes, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes)...> _CCCL_AND( + extents_type::rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(_SizeTypes... __indices) const { @@ -440,8 +438,7 @@ class mdspan # if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) _CCCL_TEMPLATE(class _ElementType, class... _SizeTypes) -_CCCL_REQUIRES(__MDSPAN_FOLD_AND(_CCCL_TRAIT(is_integral, _SizeTypes) /* && ... */) - _CCCL_AND(sizeof...(_SizeTypes) > 0)) +_CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_integral, _SizeTypes)...> _CCCL_AND(sizeof...(_SizeTypes) > 0)) _CCCL_HOST_DEVICE explicit mdspan(_ElementType*, _SizeTypes...) -> mdspan<_ElementType, dextents>; diff --git a/libcudacxx/include/cuda/std/__mdspan/submdspan.h b/libcudacxx/include/cuda/std/__mdspan/submdspan.h index 2053c3a6d88..aac6f43c85d 100644 --- a/libcudacxx/include/cuda/std/__mdspan/submdspan.h +++ b/libcudacxx/include/cuda/std/__mdspan/submdspan.h @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -250,8 +251,8 @@ struct __assign_op_slice_handler< _CUDA_VSTD::integer_sequence> { // TODO remove this for better compiler performance - static_assert(__MDSPAN_FOLD_AND((_Strides == dynamic_extent || _Strides > 0) /* && ... */), " "); - static_assert(__MDSPAN_FOLD_AND((_Offsets == dynamic_extent || _Offsets >= 0) /* && ... */), " "); + static_assert(__fold_and_v<(_Strides == dynamic_extent || _Strides > 0)...>, " "); + static_assert(__fold_and_v<(_Offsets == dynamic_extent || _Offsets >= 0)...>, " "); using __offsets_storage_t = __partially_static_sizes<_IndexT, size_t, _Offsets...>; using __extents_storage_t = __partially_static_sizes<_IndexT, size_t, _Exts...>; @@ -522,13 +523,12 @@ struct _is_layout_stride : true_type //============================================================================== _CCCL_TEMPLATE(class _ET, class _EXT, class _LP, class _AP, class... _SliceSpecs) -_CCCL_REQUIRES( - (_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) - || __detail::_is_layout_stride<_LP>::value) - _CCCL_AND __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t)) /* && ... 
*/) - _CCCL_AND(sizeof...(_SliceSpecs) == _EXT::rank())) +_CCCL_REQUIRES((_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) + || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) || __detail::_is_layout_stride<_LP>::value) + _CCCL_AND __fold_and_v<(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t))...> + _CCCL_AND(sizeof...(_SliceSpecs) == _EXT::rank())) _LIBCUDACXX_HIDE_FROM_ABI __MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE( (constexpr submdspan(mdspan<_ET, _EXT, _LP, _AP> const& __src, _SliceSpecs... __slices) noexcept), ( From 0172045fe3bd610c5f7d3179408a0830f42d12e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= <158148890+caugonnet@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:49:04 +0100 Subject: [PATCH 17/45] [STF] Implement CUDASTF_DOT_TIMING for the ctx.cuda_kernel construct (#2950) * Implement CUDASTF_DOT_TIMING facility for ctx.cuda_kernel * clang-format --- .../__stf/internal/backend_ctx.cuh | 50 ++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 2822370c1f3..7a63df4c8c3 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -64,6 +64,8 @@ class graph_ctx; class null_partition; +class stream_ctx; + namespace reserved { @@ -290,13 +292,57 @@ public: t.set_symbol(symbol); } + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + cudaEvent_t start_event, end_event; + const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + t.start(); + + int device = -1; + SCOPE(exit) { - t.end(); + t.end_uncleared(); + + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot.is_tracing()) + { + dot.template add_vertex_timing(t, milliseconds, device); + } + + if (statistics.is_calibrating()) + { + statistics.log_task_time(t, milliseconds); + } + } + } + + t.clear(); }; - auto& dot = *ctx.get_dot(); + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaGetDevice(&device)); // We will use this to force it during the next run + // Events must be created here to avoid issues with multi-gpu + cuda_safe_call(cudaEventCreate(&start_event)); + cuda_safe_call(cudaEventCreate(&end_event)); + cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); + } + } + if (dot.is_tracing()) { dot.template add_vertex(t); From 0b36a7dd077a7a38fa560d95cf04b1096d1b2466 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 13:49:27 +0100 Subject: [PATCH 18/45] Avoid potential null dereference in `annotated_ptr` (#2951) Fixes [BUG]: UB in annotated_ptr #2942 --- libcudacxx/include/cuda/annotated_ptr | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libcudacxx/include/cuda/annotated_ptr b/libcudacxx/include/cuda/annotated_ptr index 51601986b7d..7c74be390f1 100644 --- a/libcudacxx/include/cuda/annotated_ptr +++ b/libcudacxx/include/cuda/annotated_ptr @@ -391,6 +391,10 @@ public: _CCCL_HOST_DEVICE pointer get() const noexcept { + if (__repr == 
nullptr) + { + return nullptr; + } constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; return __is_shared ? __repr : &(*annotated_ptr(__repr)); } From a791939bab707c6dcfe4f45bdd27777da2a37852 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:19:00 +0100 Subject: [PATCH 19/45] make compiler version comparison utility generic (#2952) --- libcudacxx/include/cuda/std/__cccl/compiler.h | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/compiler.h b/libcudacxx/include/cuda/std/__cccl/compiler.h index fd7e93d22cb..037d5e753ed 100644 --- a/libcudacxx/include/cuda/std/__cccl/compiler.h +++ b/libcudacxx/include/cuda/std/__cccl/compiler.h @@ -11,7 +11,34 @@ #ifndef __CCCL_COMPILER_H #define __CCCL_COMPILER_H +// Utility to compare version numbers. To use: +// 1) Define a macro that makes a version number from major and minor numbers, e. g.: +// #define MYPRODUCT_MAKE_VERSION(_MAJOR, _MINOR) (_MAJOR * 100 + _MINOR) +// 2) Define a macro that you will use to compare versions, e. g.: +// #define MYPRODUCT(...) _CCCL_VERSION_COMPARE(MYPRODUCT, MYPRODUCT_##__VA_ARGS__) +// Signatures: +// MYPRODUCT(_PROD) - is the product _PROD version non-zero? +// MYPRODUCT(_PROD, _OP, _MAJOR) - compare the product _PROD version to _MAJOR using operator _OP +// MYPRODUCT(_PROD, _OP, _MAJOR, _MINOR) - compare the product _PROD version to _MAJOR._MINOR using operator _OP +#define _CCCL_VERSION_COMPARE_1(_PREFIX, _VER) (_VER != 0) +#define _CCCL_VERSION_COMPARE_3(_PREFIX, _VER, _OP, _MAJOR) _CCCL_VERSION_COMPARE_4(_PREFIX, _VER, _OP, _MAJOR, 0) +#define _CCCL_VERSION_COMPARE_4(_PREFIX, _VER, _OP, _MAJOR, _MINOR) \ + (_CCCL_VERSION_COMPARE_1(_PREFIX, _VER) && (_VER _OP _PREFIX##_MAKE_VERSION(_MAJOR, _MINOR))) +#define _CCCL_VERSION_SELECT_COUNT(_ARG1, _ARG2, _ARG3, _ARG4, _ARG5, ...) _ARG5 +#define _CCCL_VERSION_SELECT2(_ARGS) _CCCL_VERSION_SELECT_COUNT _ARGS +// MSVC traditonal preprocessor requires an extra level of indirection +#define _CCCL_VERSION_SELECT(...) \ + _CCCL_VERSION_SELECT2( \ + (__VA_ARGS__, \ + _CCCL_VERSION_COMPARE_4, \ + _CCCL_VERSION_COMPARE_3, \ + _CCCL_VERSION_COMPARE_BAD_ARG_COUNT, \ + _CCCL_VERSION_COMPARE_1, \ + _CCCL_VERSION_COMPARE_BAD_ARG_COUNT)) +#define _CCCL_VERSION_COMPARE(_PREFIX, ...) _CCCL_VERSION_SELECT(__VA_ARGS__)(_PREFIX, __VA_ARGS__) + #define _CCCL_COMPILER_MAKE_VERSION(_MAJOR, _MINOR) (_MAJOR * 100 + _MINOR) +#define _CCCL_COMPILER(...) _CCCL_VERSION_COMPARE(_CCCL_COMPILER, _CCCL_COMPILER_##__VA_ARGS__) // Determine the host compiler and its version #if defined(__INTEL_COMPILER) @@ -39,24 +66,6 @@ # define _CCCL_COMPILER_NVRTC _CCCL_COMPILER_MAKE_VERSION(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__) #endif -#define _CCCL_COMPILER_COMPARE_VERSION_1(_COMP) _COMP -#define _CCCL_COMPILER_COMPARE_VERSION_3(_COMP, _OP, _MAJOR) _CCCL_COMPILER_COMPARE_VERSION_4(_COMP, _OP, _MAJOR, 0) -#define _CCCL_COMPILER_COMPARE_VERSION_4(_COMP, _OP, _MAJOR, _MINOR) \ - (_COMP && (_COMP _OP _CCCL_COMPILER_MAKE_VERSION(_MAJOR, _MINOR))) - -#define _CCCL_COMPILER_SELECT_COUNT(_ARG1, _ARG2, _ARG3, _ARG4, _ARG5, ...) _ARG5 -#define _CCCL_COMPILER_SELECT2(_ARGS) _CCCL_COMPILER_SELECT_COUNT _ARGS -// MSVC traditonal preprocessor requires an extra level of indirection -#define _CCCL_COMPILER_SELECT(...) 
\ - _CCCL_COMPILER_SELECT2( \ - (__VA_ARGS__, \ - _CCCL_COMPILER_COMPARE_VERSION_4, \ - _CCCL_COMPILER_COMPARE_VERSION_3, \ - _CCCL_COMPILER_COMPARE_BAD_ARG_COUNT, \ - _CCCL_COMPILER_COMPARE_VERSION_1, \ - _CCCL_COMPILER_COMPARE_BAD_ARG_COUNT)) -#define _CCCL_COMPILER(...) _CCCL_COMPILER_SELECT(_CCCL_COMPILER_##__VA_ARGS__)(_CCCL_COMPILER_##__VA_ARGS__) - // Determine the cuda compiler #if defined(__NVCC__) # define _CCCL_CUDA_COMPILER_NVCC From 5ad00af618509234c02a88d9860cd1c8415ab66b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 19:02:21 +0100 Subject: [PATCH 20/45] Add SM100 descriptor to target (#2954) This is adding the missing sm_100 identifier to nv/target Fixes #2890 --- libcudacxx/include/nv/target | 171 ++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 83 deletions(-) diff --git a/libcudacxx/include/nv/target b/libcudacxx/include/nv/target index 1ad75b45b29..6a3a3f0f40c 100644 --- a/libcudacxx/include/nv/target +++ b/libcudacxx/include/nv/target @@ -50,25 +50,26 @@ typedef unsigned long long base_int_t; constexpr base_int_t all_hosts = 1; // NVIDIA GPUs -constexpr base_int_t sm_35_bit = 1 << 1; -constexpr base_int_t sm_37_bit = 1 << 2; -constexpr base_int_t sm_50_bit = 1 << 3; -constexpr base_int_t sm_52_bit = 1 << 4; -constexpr base_int_t sm_53_bit = 1 << 5; -constexpr base_int_t sm_60_bit = 1 << 6; -constexpr base_int_t sm_61_bit = 1 << 7; -constexpr base_int_t sm_62_bit = 1 << 8; -constexpr base_int_t sm_70_bit = 1 << 9; -constexpr base_int_t sm_72_bit = 1 << 10; -constexpr base_int_t sm_75_bit = 1 << 11; -constexpr base_int_t sm_80_bit = 1 << 12; -constexpr base_int_t sm_86_bit = 1 << 13; -constexpr base_int_t sm_87_bit = 1 << 14; -constexpr base_int_t sm_89_bit = 1 << 15; -constexpr base_int_t sm_90_bit = 1 << 16; +constexpr base_int_t sm_35_bit = 1 << 1; +constexpr base_int_t sm_37_bit = 1 << 2; +constexpr base_int_t sm_50_bit = 1 << 3; +constexpr base_int_t sm_52_bit = 1 << 4; +constexpr base_int_t sm_53_bit = 1 << 5; +constexpr base_int_t sm_60_bit = 1 << 6; +constexpr base_int_t sm_61_bit = 1 << 7; +constexpr base_int_t sm_62_bit = 1 << 8; +constexpr base_int_t sm_70_bit = 1 << 9; +constexpr base_int_t sm_72_bit = 1 << 10; +constexpr base_int_t sm_75_bit = 1 << 11; +constexpr base_int_t sm_80_bit = 1 << 12; +constexpr base_int_t sm_86_bit = 1 << 13; +constexpr base_int_t sm_87_bit = 1 << 14; +constexpr base_int_t sm_89_bit = 1 << 15; +constexpr base_int_t sm_90_bit = 1 << 16; +constexpr base_int_t sm_100_bit = 1 << 17; constexpr base_int_t all_devices = sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit - | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit; + | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit; // Store a set of targets as a set of bits struct _NV_BITSET_ATTRIBUTE target_description @@ -83,22 +84,23 @@ struct _NV_BITSET_ATTRIBUTE target_description // The type of the user-visible names of the NVIDIA GPU targets enum class sm_selector : base_int_t { - sm_35 = 35, - sm_37 = 37, - sm_50 = 50, - sm_52 = 52, - sm_53 = 53, - sm_60 = 60, - sm_61 = 61, - sm_62 = 62, - sm_70 = 70, - sm_72 = 72, - sm_75 = 75, - sm_80 = 80, - sm_86 = 86, - sm_87 = 87, - sm_89 = 89, - sm_90 = 90, + sm_35 = 35, + sm_37 = 37, + sm_50 = 50, + sm_52 = 52, + sm_53 = 53, + sm_60 = 60, + sm_61 = 61, + sm_62 = 62, + sm_70 = 70, + sm_72 = 72, + sm_75 = 75, + sm_80 = 80, + sm_86 = 86, + sm_87 = 87, + sm_89 = 89, + 
sm_90 = 90, + sm_100 = 100, }; constexpr base_int_t toint(sm_selector a) @@ -108,44 +110,46 @@ constexpr base_int_t toint(sm_selector a) constexpr base_int_t bitexact(sm_selector a) { - return toint(a) == 35 ? sm_35_bit - : toint(a) == 37 ? sm_37_bit - : toint(a) == 50 ? sm_50_bit - : toint(a) == 52 ? sm_52_bit - : toint(a) == 53 ? sm_53_bit - : toint(a) == 60 ? sm_60_bit - : toint(a) == 61 ? sm_61_bit - : toint(a) == 62 ? sm_62_bit - : toint(a) == 70 ? sm_70_bit - : toint(a) == 72 ? sm_72_bit - : toint(a) == 75 ? sm_75_bit - : toint(a) == 80 ? sm_80_bit - : toint(a) == 86 ? sm_86_bit - : toint(a) == 87 ? sm_87_bit - : toint(a) == 89 ? sm_89_bit - : toint(a) == 90 ? sm_90_bit - : 0; + return toint(a) == 35 ? sm_35_bit + : toint(a) == 37 ? sm_37_bit + : toint(a) == 50 ? sm_50_bit + : toint(a) == 52 ? sm_52_bit + : toint(a) == 53 ? sm_53_bit + : toint(a) == 60 ? sm_60_bit + : toint(a) == 61 ? sm_61_bit + : toint(a) == 62 ? sm_62_bit + : toint(a) == 70 ? sm_70_bit + : toint(a) == 72 ? sm_72_bit + : toint(a) == 75 ? sm_75_bit + : toint(a) == 80 ? sm_80_bit + : toint(a) == 86 ? sm_86_bit + : toint(a) == 87 ? sm_87_bit + : toint(a) == 89 ? sm_89_bit + : toint(a) == 90 ? sm_90_bit + : toint(a) == 100 ? sm_100_bit + : 0; } constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 90 ? sm_90_bit - : toint(a) >= 89 ? sm_89_bit - : toint(a) >= 87 ? sm_87_bit - : toint(a) >= 86 ? sm_86_bit - : toint(a) >= 80 ? sm_80_bit - : toint(a) >= 75 ? sm_75_bit - : toint(a) >= 72 ? sm_72_bit - : toint(a) >= 70 ? sm_70_bit - : toint(a) >= 62 ? sm_62_bit - : toint(a) >= 61 ? sm_61_bit - : toint(a) >= 60 ? sm_60_bit - : toint(a) >= 53 ? sm_53_bit - : toint(a) >= 52 ? sm_52_bit - : toint(a) >= 50 ? sm_50_bit - : toint(a) >= 37 ? sm_37_bit - : toint(a) >= 35 ? sm_35_bit - : 0; + return toint(a) >= 100 ? sm_100_bit + : toint(a) >= 90 ? sm_90_bit + : toint(a) >= 89 ? sm_89_bit + : toint(a) >= 87 ? sm_87_bit + : toint(a) >= 86 ? sm_86_bit + : toint(a) >= 80 ? sm_80_bit + : toint(a) >= 75 ? sm_75_bit + : toint(a) >= 72 ? sm_72_bit + : toint(a) >= 70 ? sm_70_bit + : toint(a) >= 62 ? sm_62_bit + : toint(a) >= 61 ? sm_61_bit + : toint(a) >= 60 ? sm_60_bit + : toint(a) >= 53 ? sm_53_bit + : toint(a) >= 52 ? sm_52_bit + : toint(a) >= 50 ? sm_50_bit + : toint(a) >= 37 ? sm_37_bit + : toint(a) >= 35 ? 
sm_35_bit + : 0; } // Public API for NVIDIA GPUs @@ -188,22 +192,23 @@ constexpr target_description any_target = target_description(detail::all_hosts | constexpr target_description no_target = target_description(0); // The public names for NVIDIA GPU architectures -constexpr sm_selector sm_35 = sm_selector::sm_35; -constexpr sm_selector sm_37 = sm_selector::sm_37; -constexpr sm_selector sm_50 = sm_selector::sm_50; -constexpr sm_selector sm_52 = sm_selector::sm_52; -constexpr sm_selector sm_53 = sm_selector::sm_53; -constexpr sm_selector sm_60 = sm_selector::sm_60; -constexpr sm_selector sm_61 = sm_selector::sm_61; -constexpr sm_selector sm_62 = sm_selector::sm_62; -constexpr sm_selector sm_70 = sm_selector::sm_70; -constexpr sm_selector sm_72 = sm_selector::sm_72; -constexpr sm_selector sm_75 = sm_selector::sm_75; -constexpr sm_selector sm_80 = sm_selector::sm_80; -constexpr sm_selector sm_86 = sm_selector::sm_86; -constexpr sm_selector sm_87 = sm_selector::sm_87; -constexpr sm_selector sm_89 = sm_selector::sm_89; -constexpr sm_selector sm_90 = sm_selector::sm_90; +constexpr sm_selector sm_35 = sm_selector::sm_35; +constexpr sm_selector sm_37 = sm_selector::sm_37; +constexpr sm_selector sm_50 = sm_selector::sm_50; +constexpr sm_selector sm_52 = sm_selector::sm_52; +constexpr sm_selector sm_53 = sm_selector::sm_53; +constexpr sm_selector sm_60 = sm_selector::sm_60; +constexpr sm_selector sm_61 = sm_selector::sm_61; +constexpr sm_selector sm_62 = sm_selector::sm_62; +constexpr sm_selector sm_70 = sm_selector::sm_70; +constexpr sm_selector sm_72 = sm_selector::sm_72; +constexpr sm_selector sm_75 = sm_selector::sm_75; +constexpr sm_selector sm_80 = sm_selector::sm_80; +constexpr sm_selector sm_86 = sm_selector::sm_86; +constexpr sm_selector sm_87 = sm_selector::sm_87; +constexpr sm_selector sm_89 = sm_selector::sm_89; +constexpr sm_selector sm_90 = sm_selector::sm_90; +constexpr sm_selector sm_100 = sm_selector::sm_100; using detail::is_exactly; using detail::provides; From ace320be094aa157548323dcd3c72b54ccf85579 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 25 Nov 2024 19:39:04 +0100 Subject: [PATCH 21/45] Regenerate `cuda::ptx` headers/docs and run format (#2937) Overwrites all generated PTX header and documentation files and runs `pre-commit run --all-files`. Also exclude generated PTX headers from header check. 
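A usage sketch for the nv/target change above (illustration only, not part of either patch): the excerpt shows only the new sm_100 selector and bit constants, so the NV_PROVIDES_SM_100 query macro used below is an assumed name following the existing NV_PROVIDES_SM_xx pattern rather than something confirmed by this excerpt.

    #include <nv/target>

    __device__ int sm_bucket()
    {
      // Exactly one branch is compiled, chosen by the architecture this
      // translation unit targets. NV_PROVIDES_SM_100 is an assumed macro
      // name, mirroring the NV_PROVIDES_SM_xx queries of existing targets.
      NV_IF_ELSE_TARGET(NV_PROVIDES_SM_100,
                        (return 100;),
                        (return 0;));
    }

The same dispatch pattern already used for sm_90 and older architectures should extend unchanged once the new descriptor is wired into the query macros.
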
--- .../generated/barrier_cluster.rst | 13 ++- .../instructions/generated/cp_async_bulk.rst | 9 +- .../generated/cp_async_bulk_commit_group.rst | 5 +- .../generated/cp_async_bulk_multicast.rst | 5 +- .../generated/cp_async_bulk_tensor.rst | 23 ++-- .../cp_async_bulk_tensor_multicast.rst | 13 ++- .../generated/cp_async_bulk_wait_group.rst | 3 + .../generated/cp_reduce_async_bulk.rst | 55 +++++----- .../generated/cp_reduce_async_bulk_bf16.rst | 9 +- .../generated/cp_reduce_async_bulk_f16.rst | 9 +- .../generated/cp_reduce_async_bulk_tensor.rst | 3 + .../ptx/instructions/generated/fence.rst | 3 + .../generated/fence_mbarrier_init.rst | 5 +- .../generated/fence_proxy_alias.rst | 5 +- .../generated/fence_proxy_async.rst | 5 +- .../fence_proxy_tensormap_generic.rst | 3 + .../ptx/instructions/generated/getctarank.rst | 5 +- .../generated/mbarrier_arrive.rst | 11 +- .../generated/mbarrier_arrive_expect_tx.rst | 5 +- .../generated/mbarrier_arrive_no_complete.rst | 5 +- .../generated/mbarrier_expect_tx.rst | 3 + .../instructions/generated/mbarrier_init.rst | 5 +- .../generated/mbarrier_test_wait.rst | 5 +- .../generated/mbarrier_test_wait_parity.rst | 5 +- .../generated/mbarrier_try_wait.rst | 7 +- .../generated/mbarrier_try_wait_parity.rst | 7 +- .../ptx/instructions/generated/red_async.rst | 23 ++-- .../ptx/instructions/generated/st_async.rst | 3 + .../generated/tensormap_cp_fenceproxy.rst | 3 + .../generated/tensormap_replace.rst | 3 + .../cuda/__ptx/instructions/barrier_cluster.h | 2 +- .../cuda/__ptx/instructions/cp_async_bulk.h | 4 +- .../instructions/cp_async_bulk_commit_group.h | 2 +- .../__ptx/instructions/cp_async_bulk_tensor.h | 4 +- .../instructions/cp_async_bulk_wait_group.h | 2 +- .../__ptx/instructions/cp_reduce_async_bulk.h | 6 +- .../cp_reduce_async_bulk_tensor.h | 2 +- .../include/cuda/__ptx/instructions/fence.h | 10 +- ...{barrier_cluster.inc => barrier_cluster.h} | 17 ++- .../{cp_async_bulk.inc => cp_async_bulk.h} | 13 ++- ...group.inc => cp_async_bulk_commit_group.h} | 9 +- ...ulticast.inc => cp_async_bulk_multicast.h} | 9 +- ...bulk_tensor.inc => cp_async_bulk_tensor.h} | 27 +++-- ...t.inc => cp_async_bulk_tensor_multicast.h} | 17 ++- ...t_group.inc => cp_async_bulk_wait_group.h} | 15 ++- ..._async_bulk.inc => cp_reduce_async_bulk.h} | 67 ++++++------ ...k_bf16.inc => cp_reduce_async_bulk_bf16.h} | 27 +++-- ...ulk_f16.inc => cp_reduce_async_bulk_f16.h} | 25 +++-- ...nsor.inc => cp_reduce_async_bulk_tensor.h} | 77 +++++++------ .../generated/{fence.inc => fence.h} | 19 +++- ...barrier_init.inc => fence_mbarrier_init.h} | 9 +- ...ce_proxy_alias.inc => fence_proxy_alias.h} | 9 +- ...ce_proxy_async.inc => fence_proxy_async.h} | 13 ++- ...ic.inc => fence_proxy_tensormap_generic.h} | 19 +++- .../generated/{get_sreg.inc => get_sreg.h} | 103 ++++++++++-------- .../{getctarank.inc => getctarank.h} | 9 +- ...{mbarrier_arrive.inc => mbarrier_arrive.h} | 19 +++- ...ect_tx.inc => mbarrier_arrive_expect_tx.h} | 11 +- ...lete.inc => mbarrier_arrive_no_complete.h} | 9 +- .../{mbarrier_init.inc => mbarrier_init.h} | 9 +- ...ier_test_wait.inc => mbarrier_test_wait.h} | 11 +- ...parity.inc => mbarrier_test_wait_parity.h} | 11 +- ...rrier_try_wait.inc => mbarrier_try_wait.h} | 15 ++- ..._parity.inc => mbarrier_try_wait_parity.h} | 15 ++- .../generated/{red_async.inc => red_async.h} | 27 +++-- .../generated/{st_async.inc => st_async.h} | 11 +- ...nceproxy.inc => tensormap_cp_fenceproxy.h} | 13 ++- ...sormap_replace.inc => tensormap_replace.h} | 7 ++ .../cuda/__ptx/instructions/get_sreg.h | 2 +- 
.../cuda/__ptx/instructions/getctarank.h | 2 +- .../cuda/__ptx/instructions/mbarrier_arrive.h | 6 +- .../cuda/__ptx/instructions/mbarrier_init.h | 2 +- .../cuda/__ptx/instructions/mbarrier_wait.h | 8 +- .../cuda/__ptx/instructions/red_async.h | 2 +- .../cuda/__ptx/instructions/st_async.h | 2 +- .../instructions/tensormap_cp_fenceproxy.h | 2 +- .../__ptx/instructions/tensormap_replace.h | 2 +- .../test/internal_headers/CMakeLists.txt | 3 + 78 files changed, 631 insertions(+), 332 deletions(-) rename libcudacxx/include/cuda/__ptx/instructions/generated/{barrier_cluster.inc => barrier_cluster.h} (92%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk.inc => cp_async_bulk.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_commit_group.inc => cp_async_bulk_commit_group.h} (73%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_multicast.inc => cp_async_bulk_multicast.h} (86%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor.inc => cp_async_bulk_tensor.h} (96%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor_multicast.inc => cp_async_bulk_tensor_multicast.h} (95%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_wait_group.inc => cp_async_bulk_wait_group.h} (82%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk.inc => cp_reduce_async_bulk.h} (97%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_bf16.inc => cp_reduce_async_bulk_bf16.h} (89%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_f16.inc => cp_reduce_async_bulk_f16.h} (89%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_tensor.inc => cp_reduce_async_bulk_tensor.h} (91%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence.inc => fence.h} (81%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_mbarrier_init.inc => fence_mbarrier_init.h} (80%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_alias.inc => fence_proxy_alias.h} (74%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_async.inc => fence_proxy_async.h} (83%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_tensormap_generic.inc => fence_proxy_tensormap_generic.h} (85%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{get_sreg.inc => get_sreg.h} (95%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{getctarank.inc => getctarank.h} (81%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive.inc => mbarrier_arrive.h} (94%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_expect_tx.inc => mbarrier_arrive_expect_tx.h} (90%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_no_complete.inc => mbarrier_arrive_no_complete.h} (79%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_init.inc => mbarrier_init.h} (78%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait.inc => mbarrier_test_wait.h} (90%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait_parity.inc => mbarrier_test_wait_parity.h} (90%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait.inc => mbarrier_try_wait.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait_parity.inc => 
mbarrier_try_wait_parity.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{red_async.inc => red_async.h} (97%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{st_async.inc => st_async.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_cp_fenceproxy.inc => tensormap_cp_fenceproxy.h} (85%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_replace.inc => tensormap_replace.h} (99%) diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index bd994990c05..2e3b8bac188 100644 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive(); barrier.cluster.wait @@ -13,7 +16,7 @@ barrier.cluster.wait // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait(); barrier.cluster.arrive.release @@ -23,7 +26,7 @@ barrier.cluster.arrive.release // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); @@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); @@ -45,6 +48,6 @@ barrier.cluster.wait.acquire // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index f5c236f8bf9..4883d8495eb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst index 984b4aff976..07b9f9acfc1 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.commit_group ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template + template __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index 9cb15d06fa3..af027c0b623 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 40eb070e66a..1c21efdd0a3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index 2481c80bf3c..ac33a05b69f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 08ebd3c28a7..06ff8e9014c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. 
+ cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index cc82d633375..b043eb9f456 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -64,7 +67,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -83,7 +86,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -102,7 +105,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -121,7 +124,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.in // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -140,7 +143,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.de // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -159,7 +162,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -178,7 +181,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -197,7 +200,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -216,7 +219,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -235,7 +238,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void 
cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -362,7 +365,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -380,7 +383,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -398,7 +401,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -416,7 +419,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -434,7 +437,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -452,7 +455,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -470,7 +473,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -488,7 +491,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -506,7 +509,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -524,7 +527,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -542,7 +545,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -560,7 +563,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -578,7 +581,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } - template + template __device__ static inline void 
cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -596,7 +599,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -614,7 +617,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -632,7 +635,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst index e4dea98a119..80e927d0375 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst index 18c5e0bfc60..0d658fd9256 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index c653b01cd60..d587d3f51a2 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index 2fe14dcb3b2..ed21fa80b6e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst index 0f5298e3359..c7dd357632a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.mbarrier_init.release.cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ fence.mbarrier_init.release.cluster // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } - template + template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst index 935aab9b6df..fdd1f8d0b12 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.alias ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template + template __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 3e741a1f6c4..8376e96ce6b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -1,9 +1,12 @@ +.. 
+ This file was automatically generated. Do not edit. + fence.proxy.async ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template + template __device__ static inline void fence_proxy_async(); fence.proxy.async.global diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst index db582971c3d..78c3cd308a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.tensormap::generic.release.cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index c85f52ee302..374c182576f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } - template + template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 92cd106cad9..21436e2b3ca 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); @@ -12,7 +15,7 @@ mbarrier.arrive.shared::cta.b64 .. code:: cuda // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -87,7 +90,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -102,7 +105,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 0087ae2f458..47c56eca31a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda @@ -38,7 +41,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst index b6d7edbbeee..ba909ae1f56 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.noComplete.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index b87d6f62a23..46adcd16be3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 3e529d86d78..2c3520a20f6 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 - template + template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index 4cb241c7ca8..d16b2ac07ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index e750c4a543f..ec464b3398b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.parity.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. 
PTX ISA 71, SM_80 - template + template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index ce648c66ee9..3dfdba46861 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -13,7 +16,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 3210dc0eab1..4e7af4bace5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.parity.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -13,7 +16,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index d6b9cf36549..658fe0a8f44 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda @@ -5,7 +8,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -19,7 +22,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -33,7 +36,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -47,7 +50,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -61,7 +64,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -75,7 +78,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -89,7 +92,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -103,7 +106,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -159,7 +162,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -172,7 +175,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, 
[remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index c519ea57f70..d00a152cf29 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 52fae102ad4..e42bae5a5a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 33e6f1d839a..a8c4a260782 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index 8b09ddd1110..93b6a06037c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 480a02a701e..abfba441ac9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index bd97259cf19..f9320e975f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. 
Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 5b9f575ce5f..7de5b41b744 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 00a3700e1a9..0d933e2cc34 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee6d90bc4d9..f1487301ada 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,12 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -#include +#include #ifdef _LIBCUDACXX_HAS_NVF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index a6b23a706c7..436c42d4c3f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 045f09cb40e..a8dccf979c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,11 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. 
Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index ca9238bc3ff..10d55714c5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ + /* // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive(); */ #if __cccl_ptx_isa >= 780 @@ -24,7 +29,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive() /* // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait(); */ #if __cccl_ptx_isa >= 780 @@ -48,7 +53,7 @@ _CCCL_DEVICE static inline void barrier_cluster_wait() // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); */ @@ -74,7 +79,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); */ @@ -100,7 +105,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); */ @@ -121,3 +126,5 @@ _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 69f77053b95..8ba40d45f64 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ + /* // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. 
unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -82,7 +87,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -109,3 +114,5 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h similarity index 73% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 24baddaea8f..7bb58675ddb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ + /* // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template +template __device__ static inline void cp_async_bulk_commit_group(); */ #if __cccl_ptx_isa >= 800 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_commit_group() __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index cdd5a535eb6..a5534ef0b48 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ + /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
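A hedged sketch of the unicast cp_async_bulk path shown above (global to shared::cluster, completion signalled on a shared-memory mbarrier). The helper and parameter names are assumptions; smem_dst and smem_bar must point into shared memory, and bytes is the transfer size in bytes:

#include <cuda/ptx>
#include <cstdint>

__device__ void stage_tile(void* smem_dst, const void* gmem_src, std::uint32_t bytes, std::uint64_t* smem_bar)
{
  // One bulk copy from global memory into (cluster-)shared memory; the mbarrier
  // at smem_bar is updated when the bytes have arrived.
  cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global, smem_dst, gmem_src, bytes, smem_bar);
  // Completion is then observed by waiting on the mbarrier (not shown here).
}
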
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -43,3 +48,5 @@ _CCCL_DEVICE static inline void cp_async_bulk( __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h similarity index 96% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 547888d5b0f..3cbd26fda04 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -42,7 +47,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -79,7 +84,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -122,7 +127,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -244,7 +249,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1d. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -289,7 +294,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -331,7 +336,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -414,3 +419,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 020698a15b1..915979d18f3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -96,7 +101,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2d. 
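A sketch of the 2-D cp_async_bulk_tensor variant from the header above; the tensor map is assumed to be a CUtensorMap prepared on the host, and all names are illustrative:

#include <cuda/ptx>
#include <cstdint>

__device__ void load_tile_2d(
  void* smem_dst, const void* tensor_map, std::int32_t col, std::int32_t row, std::uint64_t* smem_bar)
{
  // Fetch the tile at (col, row) described by tensor_map into shared memory,
  // signalling the shared-memory mbarrier on completion.
  const std::int32_t coords[2] = {col, row};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global, smem_dst, tensor_map, coords, smem_bar);
}
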
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -193,7 +198,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -237,3 +242,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h similarity index 82% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 1a715a0fac6..2057323665a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ + /* // cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 template @@ -7,13 +12,13 @@ __device__ static inline void cp_async_bulk_wait_group( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message @@ -30,16 +35,18 @@ __device__ static inline void cp_async_bulk_wait_group_read( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group.read %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 50059ff6c5b..a35684c85e1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -1,5 +1,8 @@ -// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ + /* // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90 @@ -154,7 +157,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -203,7 +206,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -252,7 +255,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -301,7 +304,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -350,7 +353,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -399,7 +402,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -448,7 +451,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -497,7 +500,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -546,7 +549,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -595,7 +598,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -670,7 +673,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
: : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -715,7 +718,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -760,7 +763,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -778,7 +781,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -820,7 +823,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -862,7 +865,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -904,7 +907,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -946,7 +949,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -988,7 +991,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1030,7 +1033,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1072,7 +1075,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1114,7 +1117,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1156,7 +1159,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( 
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1198,7 +1201,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1240,7 +1243,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1282,7 +1285,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1324,7 +1327,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1361,7 +1364,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1398,7 +1401,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1433,3 +1436,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index c657e8d1935..1e13bb5f4f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -1,11 +1,15 @@ -#ifdef _LIBCUDACXX_HAS_NVBF16 +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
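A sketch combining the shared::cta to global cp_reduce_async_bulk overloads above with the cp_async_bulk_commit_group and cp_async_bulk_wait_group_read wrappers from earlier in this patch; function and variable names are assumptions, and bytes is the transfer size in bytes:

#include <cuda/ptx>
#include <cstdint>

__device__ void flush_partials(std::uint32_t* gmem_acc, const std::uint32_t* smem_partials, std::uint32_t bytes)
{
  namespace ptx = cuda::ptx;
  // Element-wise add the shared-memory partial sums into the global accumulator.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_add, gmem_acc, smem_partials, bytes);
  ptx::cp_async_bulk_commit_group();                   // close the current bulk-async group
  ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>{}); // wait until the group's reads of smem_partials are done
}
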
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -14,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -39,7 +43,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 @@ -47,7 +51,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -56,7 +60,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -81,7 +85,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 @@ -89,7 +93,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +102,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -123,5 +127,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3a52630db53..0c4678c95bb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -1,10 +1,15 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -13,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -33,7 +38,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
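The same bulk-reduce pattern for the __nv_bfloat16 overloads above, usable only when bf16 support is enabled (_LIBCUDACXX_HAS_NVBF16); a hedged sketch with invented names:

#include <cuda/ptx>
#include <cuda_bf16.h>
#include <cstdint>

__device__ void min_into_global_bf16(__nv_bfloat16* gmem_dst, const __nv_bfloat16* smem_src, std::uint32_t bytes)
{
  // Element-wise minimum of a shared-memory bf16 tile into a global bf16 buffer.
  cuda::ptx::cp_reduce_async_bulk(
    cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min, gmem_dst, smem_src, bytes);
}
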
PTX ISA 80, SM_90 @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -50,7 +55,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -70,7 +75,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 @@ -78,7 +83,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -87,7 +92,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -107,4 +112,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 32008f6af5b..9ec5b2443d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ + /* // cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, SM_90 @@ -37,37 +42,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) @@ -118,37 +123,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -212,7 +217,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -221,7 +226,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -230,7 +235,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -239,7 +244,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -248,7 +253,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -257,7 +262,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -317,7 +322,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
: : "l"(__tensorMap), @@ -327,7 +332,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -337,7 +342,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -347,7 +352,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -357,7 +362,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -367,7 +372,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -440,7 +445,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -452,7 +457,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -464,7 +469,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." 
: @@ -476,7 +481,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -488,7 +493,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -500,7 +505,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " "1e." : @@ -512,7 +517,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -530,3 +535,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index f10ec07ebb5..db00c4d4cba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_H_ +#define _CUDA_PTX_GENERATED_FENCE_H_ + /* // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 // .sem = { .sc, .acq_rel } @@ -19,15 +24,15 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope ( _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { asm volatile("fence.acq_rel.gpu; // 1." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); }), ( @@ -57,7 +62,7 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) ( _CCCL_IF_CONSTEXPR (__sem == sem_sc) { asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); }), ( @@ -65,3 +70,5 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) __cuda_ptx_fence_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h similarity index 80% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index 0d39c222598..e185913b3cd 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -1,8 +1,13 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ + /* // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } -template +template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h similarity index 74% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 98260b851ca..40229b84a96 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ + /* // fence.proxy.alias; // 4. 
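A short sketch of the fence wrappers regenerated above: a device-scope acq_rel fence followed by a generic-to-async proxy fence, as typically issued before handing shared-memory data to a bulk-copy engine; the function name is an assumption and the proxy fence requires sm_90:

#include <cuda/ptx>

__device__ void publish_before_bulk_copy()
{
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu); // fence.acq_rel.gpu
  cuda::ptx::fence_proxy_async();                                 // fence.proxy.async
}
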
PTX ISA 75, SM_70 -template +template __device__ static inline void fence_proxy_alias(); */ #if __cccl_ptx_isa >= 750 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void fence_proxy_alias() __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); } #endif // __cccl_ptx_isa >= 750 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f0a37baabdb..f64b5faee5e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ + /* // fence.proxy.async; // 5. PTX ISA 80, SM_90 -template +template __device__ static inline void fence_proxy_async(); */ #if __cccl_ptx_isa >= 800 @@ -38,9 +43,9 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) ( _CCCL_IF_CONSTEXPR (__space == space_global) { asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + } else _CCCL_IF_CONSTEXPR (__space == space_shared) { asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); }), ( @@ -48,3 +53,5 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 3e5b2a265f4..1e6119ee032 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ + /* // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 // .sem = { .release } @@ -19,11 +24,11 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco ( _CCCL_IF_CONSTEXPR (__scope == scope_cta) { asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); }), ( @@ -59,17 +64,17 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) @@ -80,3 +85,5 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index dd3079915f7..08128cc00a1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
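A sketch of the fence_proxy_tensormap_generic release/acquire pair completed just above: the writer releases a tensor map it modified through the generic proxy, and a consumer acquires the 128-byte object before using it with bulk tensor copies. The helper names and the gpu scope are assumptions:

#include <cuda/ptx>

__device__ void release_tensor_map()
{
  cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
}

__device__ void acquire_tensor_map(const void* tensor_map)
{
  cuda::ptx::fence_proxy_tensormap_generic(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, tensor_map, cuda::ptx::n32_t<128>{});
}
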
+ +#ifndef _CUDA_PTX_GENERATED_GET_SREG_H_ +#define _CUDA_PTX_GENERATED_GET_SREG_H_ + /* // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_x(); */ #if __cccl_ptx_isa >= 200 @@ -15,7 +20,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() /* // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_y(); */ #if __cccl_ptx_isa >= 200 @@ -30,7 +35,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() /* // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_z(); */ #if __cccl_ptx_isa >= 200 @@ -45,7 +50,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() /* // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_x(); */ #if __cccl_ptx_isa >= 200 @@ -60,7 +65,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() /* // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_y(); */ #if __cccl_ptx_isa >= 200 @@ -75,7 +80,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() /* // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_z(); */ #if __cccl_ptx_isa >= 200 @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() /* // mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_laneid(); */ #if __cccl_ptx_isa >= 130 @@ -105,7 +110,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() /* // mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_warpid(); */ #if __cccl_ptx_isa >= 130 @@ -120,7 +125,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() /* // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nwarpid(); */ #if __cccl_ptx_isa >= 200 @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() /* // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() /* // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -174,7 +179,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() /* // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -189,7 +194,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() /* // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -204,7 +209,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() /* // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -219,7 +224,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() /* // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_z(); 
*/ #if __cccl_ptx_isa >= 200 @@ -234,7 +239,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() /* // mov.u32 sreg_value, %%smid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_smid(); */ #if __cccl_ptx_isa >= 130 @@ -249,7 +254,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() /* // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nsmid(); */ #if __cccl_ptx_isa >= 200 @@ -273,7 +278,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() /* // mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template +template __device__ static inline uint64_t get_sreg_gridid(); */ #if __cccl_ptx_isa >= 300 @@ -288,7 +293,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() /* // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template +template __device__ static inline bool get_sreg_is_explicit_cluster(); */ #if __cccl_ptx_isa >= 780 @@ -315,7 +320,7 @@ _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() /* // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -339,7 +344,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() /* // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -363,7 +368,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() /* // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -387,7 +392,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() /* // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -411,7 +416,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() /* // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -435,7 +440,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() /* // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -459,7 +464,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() /* // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -483,7 +488,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() /* // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -507,7 +512,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() /* // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -531,7 +536,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() /* // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 
-template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -555,7 +560,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() /* // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -579,7 +584,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() /* // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -603,7 +608,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() /* // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctarank(); */ #if __cccl_ptx_isa >= 780 @@ -627,7 +632,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() /* // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctarank(); */ #if __cccl_ptx_isa >= 780 @@ -651,7 +656,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() /* // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_eq(); */ #if __cccl_ptx_isa >= 200 @@ -675,7 +680,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() /* // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_le(); */ #if __cccl_ptx_isa >= 200 @@ -699,7 +704,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() /* // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_lt(); */ #if __cccl_ptx_isa >= 200 @@ -723,7 +728,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() /* // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_ge(); */ #if __cccl_ptx_isa >= 200 @@ -747,7 +752,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() /* // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_gt(); */ #if __cccl_ptx_isa >= 200 @@ -771,7 +776,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() /* // mov.u32 sreg_value, %%clock; // PTX ISA 10 -template +template __device__ static inline uint32_t get_sreg_clock(); */ #if __cccl_ptx_isa >= 100 @@ -786,7 +791,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() /* // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template +template __device__ static inline uint32_t get_sreg_clock_hi(); */ #if __cccl_ptx_isa >= 500 @@ -810,7 +815,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() /* // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template +template __device__ static inline uint64_t get_sreg_clock64(); */ #if __cccl_ptx_isa >= 200 @@ -834,7 +839,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() /* // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template +template __device__ static inline uint64_t get_sreg_globaltimer(); */ #if __cccl_ptx_isa >= 310 @@ -858,7 +863,7 @@ _CCCL_DEVICE static inline 
_CUDA_VSTD::uint64_t get_sreg_globaltimer() /* // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_lo(); */ #if __cccl_ptx_isa >= 310 @@ -882,7 +887,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() /* // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_hi(); */ #if __cccl_ptx_isa >= 310 @@ -906,7 +911,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() /* // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_total_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -930,7 +935,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() /* // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template +template __device__ static inline uint32_t get_sreg_aggr_smem_size(); */ #if __cccl_ptx_isa >= 810 @@ -954,7 +959,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() /* // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -978,7 +983,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() /* // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template +template __device__ static inline uint64_t get_sreg_current_graph_exec(); */ #if __cccl_ptx_isa >= 800 @@ -999,3 +1004,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_GET_SREG_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 51bd351be87..a769868f45c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_GETCTARANK_H_ +#define _CUDA_PTX_GENERATED_GETCTARANK_H_ + /* // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } -template +template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, cons __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_GETCTARANK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index f3e2b860d50..e1afe25d8c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ + /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ @@ -25,7 +30,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint /* // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -79,7 +84,7 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) @@ -125,7 +130,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -142,7 +147,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -175,7 +180,7 @@ mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uin // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -203,3 +208,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive( __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index efb749957b1..79301a57851 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ + /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } @@ -32,7 +37,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -77,3 +82,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h similarity index 79% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 879bedebdc9..cbfb275baa4 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); @@ -24,3 +29,5 @@ mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h similarity index 78% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 3afeeacfccf..d1e5c57c97e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ + /* // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template +template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); @@ -21,3 +26,5 @@ _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, cons __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 301c0364af4..f3dbb6ed1c3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ + /* // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template +ISA 70, SM_80 template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); @@ -58,7 +63,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
" "\n\t" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 604cfd92045..b975434b2de 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ + /* // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template +ISA 71, SM_80 template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -59,7 +64,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index c5f2062664c..dd50a2c9f41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ + /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -29,7 +34,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX -ISA 78, SM_90 template +ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, @@ -89,7 +94,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " "\n\t" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index 321bfc515da..d3deb3ca1d5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ + /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -30,7 +35,7 @@ mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_ /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 3157fa1c627..d88392f3635 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_RED_ASYNC_H_ +#define _CUDA_PTX_GENERATED_RED_ASYNC_H_ + /* // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -35,7 +40,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -67,7 +72,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -99,7 +104,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -131,7 +136,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -163,7 +168,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -195,7 +200,7 @@ red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -227,7 +232,7 @@ red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -358,7 +363,7 @@ red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -389,7 +394,7 @@ _CCCL_DEVICE static inline void red_async( // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, @@ -415,3 +420,5 @@ red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __va 
__cuda_ptx_red_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_RED_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 9dfab243ffe..18fd2c03a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_ST_ASYNC_H_ +#define _CUDA_PTX_GENERATED_ST_ASYNC_H_ + /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 @@ -22,7 +27,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _C : : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " : : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) @@ -61,7 +66,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2 "r"(__as_b32(__value[1])), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " : : "r"(__as_ptr_remote_dsmem(__addr)), @@ -106,3 +111,5 @@ _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_ST_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index 033d0606e7f..b51b5185db0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ + /* // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 @@ -28,19 +33,19 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" : @@ -52,3 +57,5 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h similarity index 99% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3b1060ead38..3889026750d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ + /* // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a // .space = { .global } @@ -567,3 +572,5 @@ _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 033005beb5b..3157f7d1da9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f5ed3424d3b..c41084f5ae3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. 
Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index fb1341a61d8..0a44942df82 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,9 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#include -#include -#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 575abda7a41..b3539245e03 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 2d6adb78eec..dfcc03bc01c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,10 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -#include -#include -#include -#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index a610cf2b583..d14a96dc725 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index 09199b4a3ce..ffad9f176d0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. 
Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index de179f69735..22eaa502305 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index 2f81d8b4361..681a820b070 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/test/internal_headers/CMakeLists.txt b/libcudacxx/test/internal_headers/CMakeLists.txt index 4c1031e5b4f..1f1e4947efb 100644 --- a/libcudacxx/test/internal_headers/CMakeLists.txt +++ b/libcudacxx/test/internal_headers/CMakeLists.txt @@ -26,6 +26,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND NOT "${CMAKE_CXX_STANDARD}" M list(FILTER internal_headers EXCLUDE REGEX "mdspan") endif() +# generated cuda::ptx headers are not standalone +list(FILTER internal_headers EXCLUDE REGEX "__ptx/instructions/generated") + function(libcudacxx_create_internal_header_test header_name, headertest_src, fallback) if(fallback) set(header_name "${header_name}_fallback") From efee771d1b5cdf1feb2ddc256249d14ec0768839 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 25 Nov 2024 19:41:13 +0100 Subject: [PATCH 22/45] Regenerate PTX test (#2953) Overwrites all generated PTX tests and runs `pre-commit run --all-files` --- ...{barrier_cluster.inc => barrier_cluster.h} | 16 +++++++++++ .../{cp_async_bulk.inc => cp_async_bulk.h} | 16 +++++++++++ .../generated/cp_async_bulk_commit_group.h | 26 +++++++++++++++++ .../generated/cp_async_bulk_commit_group.inc | 10 ------- ...ulticast.inc => cp_async_bulk_multicast.h} | 16 +++++++++++ ...bulk_tensor.inc => cp_async_bulk_tensor.h} | 16 +++++++++++ ...t.inc => cp_async_bulk_tensor_multicast.h} | 16 +++++++++++ ...t_group.inc => cp_async_bulk_wait_group.h} | 16 +++++++++++ ..._async_bulk.inc => cp_reduce_async_bulk.h} | 16 +++++++++++ ...k_bf16.inc => cp_reduce_async_bulk_bf16.h} | 28 +++++++++++++++---- ...ulk_f16.inc => cp_reduce_async_bulk_f16.h} | 28 +++++++++++++++---- ...nsor.inc => cp_reduce_async_bulk_tensor.h} | 16 +++++++++++ .../cuda/ptx/generated/{fence.inc => fence.h} | 16 +++++++++++ .../cuda/ptx/generated/fence_mbarrier_init.h | 27 ++++++++++++++++++ .../ptx/generated/fence_mbarrier_init.inc | 11 -------- .../cuda/ptx/generated/fence_proxy_alias.h | 25 +++++++++++++++++ 
.../cuda/ptx/generated/fence_proxy_alias.inc | 9 ------ ...ce_proxy_async.inc => fence_proxy_async.h} | 16 +++++++++++ ...ic.inc => fence_proxy_tensormap_generic.h} | 16 +++++++++++ .../generated/{get_sreg.inc => get_sreg.h} | 16 +++++++++++ .../cuda/ptx/generated/getctarank.h | 26 +++++++++++++++++ .../cuda/ptx/generated/getctarank.inc | 10 ------- ...{mbarrier_arrive.inc => mbarrier_arrive.h} | 16 +++++++++++ ...ect_tx.inc => mbarrier_arrive_expect_tx.h} | 16 +++++++++++ .../generated/mbarrier_arrive_no_complete.h | 26 +++++++++++++++++ .../generated/mbarrier_arrive_no_complete.inc | 10 ------- .../cuda/ptx/generated/mbarrier_init.h | 26 +++++++++++++++++ .../cuda/ptx/generated/mbarrier_init.inc | 10 ------- ...rrier_try_wait.inc => mbarrier_try_wait.h} | 16 +++++++++++ ..._parity.inc => mbarrier_try_wait_parity.h} | 16 +++++++++++ .../{mbarrier_wait.inc => mbarrier_wait.h} | 0 ...wait_parity.inc => mbarrier_wait_parity.h} | 0 .../generated/{red_async.inc => red_async.h} | 16 +++++++++++ .../generated/{st_async.inc => st_async.h} | 16 +++++++++++ ...nceproxy.inc => tensormap_cp_fenceproxy.h} | 16 +++++++++++ ...sormap_replace.inc => tensormap_replace.h} | 16 +++++++++++ .../ptx/ptx.barrier.cluster.compile.pass.cpp | 19 +------------ ...p.async.bulk.commit_group.compile.pass.cpp | 19 +------------ .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 19 +------------ ...x.cp.async.bulk.multicast.compile.pass.cpp | 19 +------------ .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 19 +------------ ...ync.bulk.tensor.multicast.compile.pass.cpp | 19 +------------ ....cp.async.bulk.wait_group.compile.pass.cpp | 19 +------------ .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 23 ++------------- ....reduce.async.bulk.tensor.compile.pass.cpp | 19 +------------ .../cuda/ptx/ptx.fence.compile.pass.cpp | 27 ++++-------------- .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 19 +------------ .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 19 +------------ .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 23 ++------------- .../ptx/ptx.mbarrier.init.compile.pass.cpp | 19 +------------ .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 25 +++-------------- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 19 +------------ .../cuda/ptx/ptx.st.async.compile.pass.cpp | 19 +------------ ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 19 +------------ .../ptx.tensormap.replace.compile.pass.cpp | 19 +------------ 55 files changed, 550 insertions(+), 425 deletions(-) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{barrier_cluster.inc => barrier_cluster.h} (69%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk.inc => cp_async_bulk.h} (66%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_multicast.inc => cp_async_bulk_multicast.h} (51%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_tensor.inc => cp_async_bulk_tensor.h} (87%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_tensor_multicast.inc => cp_async_bulk_tensor_multicast.h} (83%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_wait_group.inc => cp_async_bulk_wait_group.h} (50%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk.inc => cp_reduce_async_bulk.h} (96%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk_bf16.inc => 
cp_reduce_async_bulk_bf16.h} (65%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk_f16.inc => cp_reduce_async_bulk_f16.h} (59%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk_tensor.inc => cp_reduce_async_bulk_tensor.h} (97%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{fence.inc => fence.h} (71%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{fence_proxy_async.inc => fence_proxy_async.h} (58%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{fence_proxy_tensormap_generic.inc => fence_proxy_tensormap_generic.h} (78%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{get_sreg.inc => get_sreg.h} (94%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_arrive.inc => mbarrier_arrive.h} (82%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_arrive_expect_tx.inc => mbarrier_arrive_expect_tx.h} (67%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_try_wait.inc => mbarrier_try_wait.h} (77%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_try_wait_parity.inc => mbarrier_try_wait_parity.h} (77%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_wait.inc => mbarrier_wait.h} (100%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_wait_parity.inc => mbarrier_wait_parity.h} (100%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{red_async.inc => red_async.h} (87%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{st_async.inc => st_async.h} (70%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{tensormap_cp_fenceproxy.inc => tensormap_cp_fenceproxy.h} (70%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{tensormap_replace.inc => tensormap_replace.h} (91%) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.h similarity index 69% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.h index cad5510ba70..52c47bf2f9d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. 
So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_barrier_cluster(void** fn_ptr) { #if __cccl_ptx_isa >= 780 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h similarity index 66% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h index cd66de989a2..a342954591a 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h new file mode 100644 index 00000000000..b017312d979 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
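
As a minimal, self-contained sketch of the force-instantiation pattern described in the comment above (hypothetical `square` overloads stand in for the cuda::ptx wrappers, and `test_force_ptx` for the generated test_* kernels in this patch):

__device__ unsigned int square(unsigned int x)
{
  return x * x;
}

__device__ unsigned long long square(unsigned long long x)
{
  return x * x;
}

__global__ void test_force_ptx(void** fn_ptr)
{
  // Storing each overload's address through the externally visible kernel
  // parameter keeps the compiler from dead-code-eliminating functions that
  // are never called, so their PTX is emitted and can be consumed by ptxas.
  // This mirrors the `* fn_ptr++ = reinterpret_cast<...>(static_cast<...>(...))`
  // lines in the generated test headers.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<unsigned int (*)(unsigned int)>(square));
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<unsigned long long (*)(unsigned long long)>(square));
}
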
+ +__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.commit_group; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc deleted file mode 100644 index afdf14abb8a..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.commit_group; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h similarity index 51% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h index b2bd0d968d9..6e2a986e7bd 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.h similarity index 87% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.h index f9d0d240d28..4618f3ea7a0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. 
+// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk_tensor(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h similarity index 83% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h index 2851aab6d7c..617bc9507bd 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.h similarity index 50% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.h index 0139a65f6ce..fa11225f316 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_cp_async_bulk_wait_group(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.h similarity index 96% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.h index 5ee274bcbe8..6f3195ebf7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_reduce_async_bulk(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.h similarity index 65% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.h index fe38374fe00..f5bfe7ef8b3 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.h @@ -1,6 +1,22 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) { -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -12,9 +28,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) __nv_bfloat16*, const __nv_bfloat16*, uint32_t)>(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -26,9 +42,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) __nv_bfloat16*, const __nv_bfloat16*, uint32_t)>(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -40,5 +56,5 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) __nv_bfloat16*, const __nv_bfloat16*, uint32_t)>(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.h similarity index 59% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.h index e7e58cfcb80..b2ce91fc12b 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.h @@ -1,6 +1,22 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) { -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -9,9 +25,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) static_cast( cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -20,9 +36,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) static_cast( cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -31,5 +47,5 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) static_cast( cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.h similarity index 97% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.h index 6f0a7d710ce..270f17a70e3 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h similarity index 71% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h index 2e464580de9..aecfcde5e01 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_fence(void** fn_ptr) { #if __cccl_ptx_isa >= 600 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h new file mode 100644 index 00000000000..29d1bf3f627 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h @@ -0,0 +1,27 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.mbarrier_init.release.cluster; // 3. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_mbarrier_init));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc deleted file mode 100644 index f503c1d055b..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc +++ /dev/null @@ -1,11 +0,0 @@ -__global__ void test_fence_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.mbarrier_init.release.cluster; // 3. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_mbarrier_init));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h new file mode 100644 index 00000000000..474f89f8b0f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h @@ -0,0 +1,25 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_proxy_alias(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 750 + NV_IF_TARGET(NV_PROVIDES_SM_70, + ( + // fence.proxy.alias; // 4. 
+ * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); +#endif // __cccl_ptx_isa >= 750 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc deleted file mode 100644 index a8021d3f5be..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc +++ /dev/null @@ -1,9 +0,0 @@ -__global__ void test_fence_proxy_alias(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 750 - NV_IF_TARGET(NV_PROVIDES_SM_70, - ( - // fence.proxy.alias; // 4. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); -#endif // __cccl_ptx_isa >= 750 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.h similarity index 58% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.h index e3d8e6d160a..56ebe6cceb0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_fence_proxy_async(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.h similarity index 78% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.h index 1e0ea93a387..288aa6c3257 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) { #if __cccl_ptx_isa >= 830 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.h similarity index 94% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.h index 90842352f90..dd4326a6a17 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_get_sreg(void** fn_ptr) { #if __cccl_ptx_isa >= 200 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h new file mode 100644 index 00000000000..b6e4b06afd6 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_getctarank(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // getctarank.shared::cluster.u32 dest, addr; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::getctarank));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc deleted file mode 100644 index 28b04c9f738..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_getctarank(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // getctarank.shared::cluster.u32 dest, addr; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::getctarank));)); -#endif // __cccl_ptx_isa >= 780 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h similarity index 82% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h index 4a94ec51d45..3cddcb3b54c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_mbarrier_arrive(void** fn_ptr) { #if __cccl_ptx_isa >= 700 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h similarity index 67% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h index 085723a452b..a2ef4b619bb 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h new file mode 100644 index 00000000000..9647ff830a8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc deleted file mode 100644 index d1d017cd3c2..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); -#endif // __cccl_ptx_isa >= 700 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h new file mode 100644 index 00000000000..d0a87419e77 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.init.shared.b64 [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_init));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc deleted file mode 100644 index f814161d1f9..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.init.shared.b64 [addr], count; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_init));)); -#endif // __cccl_ptx_isa >= 700 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h similarity index 77% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h index e9d8661a07e..00166f8172c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_mbarrier_try_wait(void** fn_ptr) { #if __cccl_ptx_isa >= 780 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h similarity index 77% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h index f8c3875451a..8aa588fbab0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_mbarrier_try_wait_parity(void** fn_ptr) { #if __cccl_ptx_isa >= 780 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.h similarity index 87% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.h index 0d562fd31a7..530d8c85967 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_red_async(void** fn_ptr) { #if __cccl_ptx_isa >= 810 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.h similarity index 70% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.h index 4efb95ef217..05ba9dd521a 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_st_async(void** fn_ptr) { #if __cccl_ptx_isa >= 810 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.h similarity index 70% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.h index 9a0a8c1f615..f5293e20ec3 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) { #if __cccl_ptx_isa >= 830 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h similarity index 91% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h index c69f3d11964..95446eb81fa 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_tensormap_replace(void** fn_ptr) { #if __cccl_ptx_isa >= 830 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index c460a2e5b09..33d08621ef4 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. 
So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/barrier_cluster.inc" +#include "generated/barrier_cluster.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index 4695221dbc5..e7ff21c2730 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_commit_group.inc" +#include "generated/cp_async_bulk_commit_group.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index b1811727b66..fdd35749cc6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk.inc" +#include "generated/cp_async_bulk.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index c040528cabc..ae1546828ae 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -16,24 +16,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. 
- * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_multicast.inc" +#include "generated/cp_async_bulk_multicast.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 0b69b8a8f1c..eeb7b4bf5a5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_tensor.inc" +#include "generated/cp_async_bulk_tensor.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 7d53d9ee0c9..d07351a2275 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -16,24 +16,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. 
- * - */ - -#include "generated/cp_async_bulk_tensor_multicast.inc" +#include "generated/cp_async_bulk_tensor_multicast.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 39df53c5f9d..87910d04941 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_wait_group.inc" +#include "generated/cp_async_bulk_wait_group.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index a186e34a809..8b916d74bf9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -14,31 +14,14 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_reduce_async_bulk.inc" +#include "generated/cp_reduce_async_bulk.h" #ifdef _LIBCUDACXX_HAS_NVF16 -# include "generated/cp_reduce_async_bulk_f16.inc" +# include "generated/cp_reduce_async_bulk_f16.h" #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include "generated/cp_reduce_async_bulk_bf16.inc" +# include "generated/cp_reduce_async_bulk_bf16.h" #endif // _LIBCUDACXX_HAS_NVBF16 int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index 14abc0d3ae6..f6a6fd61735 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. 
- * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_reduce_async_bulk_tensor.inc" +#include "generated/cp_reduce_async_bulk_tensor.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 641cb83f172..56f54b345f7 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -14,28 +14,11 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/fence.inc" -#include "generated/fence_mbarrier_init.inc" -#include "generated/fence_proxy_alias.inc" -#include "generated/fence_proxy_async.inc" -#include "generated/fence_proxy_tensormap_generic.inc" +#include "generated/fence.h" +#include "generated/fence_mbarrier_init.h" +#include "generated/fence_proxy_alias.h" +#include "generated/fence_proxy_async.h" +#include "generated/fence_proxy_tensormap_generic.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 697cc00a1be..91a6dd94bf1 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -15,24 +15,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. 
- * - */ - -#include "generated/get_sreg.inc" +#include "generated/get_sreg.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index 80fc71c0998..ed39816b7d6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/getctarank.inc" +#include "generated/getctarank.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 2350b176630..93263910906 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -14,26 +14,9 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/mbarrier_arrive.inc" -#include "generated/mbarrier_arrive_expect_tx.inc" -#include "generated/mbarrier_arrive_no_complete.inc" +#include "generated/mbarrier_arrive.h" +#include "generated/mbarrier_arrive_expect_tx.h" +#include "generated/mbarrier_arrive_no_complete.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index b445a61a8a9..7af0db56b70 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. 
- * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/mbarrier_init.inc" +#include "generated/mbarrier_init.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index e9c17a2024d..896abb8a7d8 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -14,27 +14,10 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/mbarrier_try_wait.inc" -#include "generated/mbarrier_try_wait_parity.inc" -#include "generated/mbarrier_wait.inc" -#include "generated/mbarrier_wait_parity.inc" +#include "generated/mbarrier_try_wait.h" +#include "generated/mbarrier_try_wait_parity.h" +#include "generated/mbarrier_wait.h" +#include "generated/mbarrier_wait_parity.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index 4a380ec8396..c6f66503b1f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/red_async.inc" +#include "generated/red_async.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 2c74f48e04d..7c008b77126 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. 
So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/st_async.inc" +#include "generated/st_async.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index d0d3a967836..bb5578fc730 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/tensormap_cp_fenceproxy.inc" +#include "generated/tensormap_cp_fenceproxy.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index d780ff26dca..264b7956fbb 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. 
- * - */ - -#include "generated/tensormap_replace.inc" +#include "generated/tensormap_replace.h" int main(int, char**) { From dc920c93749d0b050dd306172c5a8888a4cf058a Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 21:02:52 +0100 Subject: [PATCH 23/45] Do not include extended floating point headers if they are not needed (#2956) Fixes #2933 --- c2h/include/c2h/generators.h | 19 ++++++++++++++++++- cub/cub/detail/fast_modulo_division.cuh | 6 +++--- cub/cub/thread/thread_operators.cuh | 11 +++++++++++ cub/cub/thread/thread_reduce.cuh | 11 +++++++++++ cub/cub/util_type.cuh | 9 +++++++++ .../cuda/std/__cccl/extended_floating_point.h | 11 ----------- .../is_extended_floating_point.h | 18 +++++++++++------- .../include/cuda/std/__type_traits/promote.h | 1 + thrust/thrust/system/cuda/detail/sort.h | 11 +++++++++++ 9 files changed, 75 insertions(+), 22 deletions(-) diff --git a/c2h/include/c2h/generators.h b/c2h/include/c2h/generators.h index 20036088fa8..62f169e9e21 100644 --- a/c2h/include/c2h/generators.h +++ b/c2h/include/c2h/generators.h @@ -35,7 +35,24 @@ #include #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA -# include // for +# if defined(_CCCL_HAS_NVFP16) +# include +# endif // _CCCL_HAS_NVFP16 + +# if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP + +# if _CCCL_CUDACC_AT_LEAST(11, 8) +// cuda_fp8.h resets default for C4127, so we have to guard the inclusion +_CCCL_DIAG_PUSH +# include +_CCCL_DIAG_POP +# endif // _CCCL_CUDACC_AT_LEAST(11, 8) +# endif // _CCCL_HAS_NVBF16 + # if defined(__CUDA_FP8_TYPES_EXIST__) namespace std { diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index aa2ffd371c0..24b3204801d 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -37,6 +37,9 @@ # pragma system_header #endif // no system header +#include // implicit_prom_t +#include // CUB_IS_INT128_ENABLED + #include // cuda::std::ceil_div #include // std::has_single_bit #include // CHAR_BIT @@ -44,9 +47,6 @@ #include // numeric_limits #include // std::is_integral -#include "cub/detail/type_traits.cuh" // implicit_prom_t -#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED - #if defined(CCCL_ENABLE_DEVICE_ASSERTIONS) _CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero #endif // CCCL_ENABLE_DEVICE_ASSERTIONS diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 45d2446188f..05f2d6a41f6 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -56,6 +56,17 @@ #include // cuda::std::common_type #include // cuda::std::forward +#if defined(_CCCL_HAS_NVFP16) +# include +#endif // _CCCL_HAS_NVFP16 + +#if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _CCCL_HAS_NVFP16 + CUB_NAMESPACE_BEGIN // TODO(bgruber): deprecate in C++17 with a note: "replace by decltype(cuda::std::not_fn(EqualityOp{}))" diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index d4b4a89fdfd..f384d907b34 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -54,6 +54,17 @@ #include // uint16_t #include // cuda::std::plus +#if defined(_CCCL_HAS_NVFP16) +# include +#endif // _CCCL_HAS_NVFP16 + +#if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include 
+_CCCL_DIAG_POP +#endif // _CCCL_HAS_NVFP16 + CUB_NAMESPACE_BEGIN //! @rst diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index f062ebc4ae9..5bda9dfe98f 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -50,7 +50,16 @@ #include #include +#if defined(_CCCL_HAS_NVFP16) +# include +#endif // _CCCL_HAS_NVFP16 + #if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP + # if _CCCL_CUDACC_AT_LEAST(11, 8) // cuda_fp8.h resets default for C4127, so we have to guard the inclusion _CCCL_DIAG_PUSH diff --git a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h index 9d3c835c464..d135f406702 100644 --- a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h @@ -39,15 +39,4 @@ # endif #endif // !_CCCL_HAS_NVBF16 -#if defined(_CCCL_HAS_NVFP16) -# include -#endif // _CCCL_HAS_NVFP16 - -#if defined(_CCCL_HAS_NVBF16) -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") -# include -_CCCL_DIAG_POP -#endif // _CCCL_HAS_NVFP16 - #endif // __CCCL_EXTENDED_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h index dcc4330e107..bb1afa4225b 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h @@ -22,6 +22,17 @@ #include +#if defined(_LIBCUDACXX_HAS_NVFP16) +# include +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _LIBCUDACXX_HAS_NVBF16 + _LIBCUDACXX_BEGIN_NAMESPACE_STD template @@ -39,8 +50,6 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v #endif // !_CCCL_NO_VARIABLE_TEMPLATES #if defined(_LIBCUDACXX_HAS_NVFP16) -# include - template <> struct __is_extended_floating_point<__half> : true_type {}; @@ -52,11 +61,6 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__half> = true; #endif // _LIBCUDACXX_HAS_NVFP16 #if defined(_LIBCUDACXX_HAS_NVBF16) -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") -# include -_CCCL_DIAG_POP - template <> struct __is_extended_floating_point<__nv_bfloat16> : true_type {}; diff --git a/libcudacxx/include/cuda/std/__type_traits/promote.h b/libcudacxx/include/cuda/std/__type_traits/promote.h index 01b06989513..daa545c5fa1 100644 --- a/libcudacxx/include/cuda/std/__type_traits/promote.h +++ b/libcudacxx/include/cuda/std/__type_traits/promote.h @@ -28,6 +28,7 @@ #ifdef _LIBCUDACXX_HAS_NVFP16 # include #endif // _LIBCUDACXX_HAS_NVFP16 + #ifdef _LIBCUDACXX_HAS_NVBF16 _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h index a582cf2f3c6..3de3d5492f7 100644 --- a/thrust/thrust/system/cuda/detail/sort.h +++ b/thrust/thrust/system/cuda/detail/sort.h @@ -60,6 +60,17 @@ # include +# if defined(_CCCL_HAS_NVFP16) +# include +# endif // _CCCL_HAS_NVFP16 + +# if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +# endif // _CCCL_HAS_NVBF16 + THRUST_NAMESPACE_BEGIN namespace cuda_cub { From db47d38e2d7a0352d2b036934729851707ce66d2 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Mon, 25 
Nov 2024 12:27:34 -0800 Subject: [PATCH 24/45] [CUDAX] Add copy_bytes and fill_bytes overloads for mdspan (#2932) * Implement copy_bytes for mdspan * Add final conversion to mdspan and more tests * mdspan fill_bytes * Add docs * Fix issues after rebase * Help old GCC figure out the types * Move runtime extents check to a function * Fix clang and more old GCC fixes --- .../cuda/experimental/__algorithm/common.cuh | 24 +++++- .../cuda/experimental/__algorithm/copy.cuh | 86 ++++++++++++++++++- .../cuda/experimental/__algorithm/fill.cuh | 32 ++++++- cudax/test/algorithm/common.cuh | 29 +++++-- cudax/test/algorithm/copy.cu | 66 +++++++++++++- cudax/test/algorithm/fill.cu | 29 +++++++ 6 files changed, 253 insertions(+), 13 deletions(-) diff --git a/cudax/include/cuda/experimental/__algorithm/common.cuh b/cudax/include/cuda/experimental/__algorithm/common.cuh index 9dd891f7b28..eadb5e50dd5 100644 --- a/cudax/include/cuda/experimental/__algorithm/common.cuh +++ b/cudax/include/cuda/experimental/__algorithm/common.cuh @@ -23,15 +23,17 @@ #include #include +#include #include #include namespace cuda::experimental { + #if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES) template -concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range>; +concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range>; #else template @@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span< int>> = true; template -inline constexpr bool __valid_copy_fill_argument = +inline constexpr bool __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range> || __convertible_to_span<_Tp>; #endif +template > +using __as_mdspan_t = + _CUDA_VSTD::mdspan; + +template +inline constexpr bool __convertible_to_mdspan = false; + +template +inline constexpr bool + __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> = + true; + +template +inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan>; + } // namespace cuda::experimental #endif //__CUDAX_ALGORITHM_COMMON diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh index 9cb5cf99a0a..e2c7c73d51a 100644 --- a/cudax/include/cuda/experimental/__algorithm/copy.cuh +++ b/cudax/include/cuda/experimental/__algorithm/copy.cuh @@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD //! @brief Launches a bytewise memory copy from source to destination into the provided stream. //! -//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one. +//! Both source and destination needs to either be a `contiguous_range` or launch transform to one. +//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain `value_type` member alias. //! Both source and destination type is required to be trivially copyable. //! //! This call might be synchronous if either source or destination is pagable host memory. @@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD //! @param __src Source to copy from //! 
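(The remaining @param lines and the constrained overload itself continue directly below.) As a usage illustration of the one-dimensional copy_bytes documented here, a minimal sketch follows; the umbrella header paths and the surrounding setup are assumptions for the sketch, not part of this patch.

// Hedged usage sketch for the contiguous-range copy_bytes overload (illustrative only).
#include <cuda/experimental/algorithm.cuh> // assumed header providing cudax::copy_bytes
#include <cuda/experimental/stream.cuh>    // assumed header providing cudax::stream_ref
#include <cuda/std/span>
#include <vector>

namespace cudax = cuda::experimental;

void upload(cudax::stream_ref stream, std::vector<int>& host, cuda::std::span<int> device)
{
  // Both arguments are contiguous ranges of trivially copyable elements, so this
  // overload applies (assuming device.size() >= host.size()); because `host` is
  // pageable host memory, the call may synchronize.
  cudax::copy_bytes(stream, host, device);
  stream.wait();
}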
@param __dst Destination to copy into _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) -_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>) +_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>) void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) { __copy_bytes_impl( @@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst))))); } +template +inline constexpr bool __copy_bytes_compatible_extents = false; + +template +inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>, + _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> = + decltype(_CUDA_VSTD::__detail::__check_compatible_extents( + _CUDA_VSTD::integral_constant{}, + _CUDA_VSTD::integer_sequence{}, + _CUDA_VSTD::integer_sequence{}))::value; + +template +_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts) +{ + for (typename _SrcExtents::rank_type __i = 0; __i < __src_exts.rank(); __i++) + { + if (__src_exts.extent(__i) + != static_cast( + __dst_exts.extent((static_cast(__i))))) + { + return false; + } + } + return true; +} + +template +void __nd_copy_bytes_impl(stream_ref __stream, + _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src, + _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst) +{ + static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>, + "Multidimensional copy requires both source and destination extents to be compatible"); + static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>, + "Multidimensional copy requires both source and destination layouts to match"); + + if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents())) + { + _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source"); + } + + __copy_bytes_impl(__stream, + _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()), + _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size())); +} + +//! @brief Launches a bytewise memory copy from source to destination into the provided stream. +//! +//! Both source and destination needs to either be an instance of `cuda::std::mdspan` or launch transform to +//! one. They can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain `mdspan` template +//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Both source and +//! destination type is required to be trivially copyable. +//! +//! This call might be synchronous if either source or destination is pagable host memory. +//! It will be synchronous if both destination and copy is located in host memory. +//! +//! @param __stream Stream that the copy should be inserted into +//! @param __src Source to copy from +//! 
@param __dst Destination to copy into +_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) +_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>) +void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) +{ + decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src)); + decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)); + decltype(auto) __src_as_arg = static_cast>(__src_transformed); + decltype(auto) __dst_as_arg = static_cast>(__dst_transformed); + __nd_copy_bytes_impl( + __stream, __as_mdspan_t(__src_as_arg), __as_mdspan_t(__dst_as_arg)); +} + } // namespace cuda::experimental #endif // __CUDAX_ALGORITHM_COPY diff --git a/cudax/include/cuda/experimental/__algorithm/fill.cuh b/cudax/include/cuda/experimental/__algorithm/fill.cuh index aeb54235c78..cc7ddc61382 100644 --- a/cudax/include/cuda/experimental/__algorithm/fill.cuh +++ b/cudax/include/cuda/experimental/__algorithm/fill.cuh @@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _ //! @brief Launches an operation to bytewise fill the memory into the provided stream. //! -//! Destination needs to either be a `contiguous_range` or implicitly/launch transform -//! into one. It can't reside in pagable host memory. +//! Destination needs to either be a `contiguous_range` or launch transform +//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain `value_type` member alias. //! Destination type is required to be trivially copyable. //! +//! Destination can't reside in pagable host memory. +//! //! @param __stream Stream that the copy should be inserted into //! @param __dst Destination memory to fill //! @param __value Value to fill into every byte in the destination _CCCL_TEMPLATE(typename _DstTy) -_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>) +_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>) void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) { __fill_bytes_impl(__stream, @@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) __value); } +//! @brief Launches an operation to bytewise fill the memory into the provided stream. +//! +//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform +//! into one. It can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain `mdspan` template +//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Destination +//! type is required to be trivially copyable. +//! +//! Destination can't reside in pagable host memory. +//! +//! @param __stream Stream that the copy should be inserted into +//! @param __dst Destination memory to fill +//! 
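(The final @param line and the mdspan fill_bytes overload itself follow below.) To tie the mdspan overloads together, here is a hedged usage sketch modeled on the tests added by this patch: the header path is an assumption, the raw pointers are assumed to reference device or pinned allocations of at least 12 ints each, and the mix of static and dynamic extents is chosen to show the compatibility rule enforced above.

// Hedged usage sketch for the mdspan copy_bytes/fill_bytes overloads (illustrative only).
#include <cuda/experimental/algorithm.cuh> // assumed header providing cudax::copy_bytes/fill_bytes
#include <cuda/std/mdspan>
#include <cstddef>

namespace cudax = cuda::experimental;

void fill_then_copy(cudax::stream_ref stream, int* src_mem, int* dst_mem)
{
  using src_extents = cuda::std::extents<std::size_t, 3, cuda::std::dynamic_extent>;
  cuda::std::mdspan<int, src_extents> src(src_mem, 4);                             // 3 x 4, one static extent
  cuda::std::mdspan<int, cuda::std::dextents<std::size_t, 2>> dst(dst_mem, 3, 4);  // 3 x 4, fully dynamic

  cudax::fill_bytes(stream, src, 0xab); // fills every byte covered by the mapping
  // Static extents are compatible (3 pairs with dynamic_extent), both mdspans use the
  // default layout, and the runtime extents match, so the bytewise copy is accepted;
  // a runtime mismatch would throw invalid_argument as implemented above.
  cudax::copy_bytes(stream, src, dst);
  stream.wait();
}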
@param __value Value to fill into every byte in the destination +_CCCL_TEMPLATE(typename _DstTy) +_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>) +void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) +{ + decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)); + decltype(auto) __dst_as_arg = static_cast>(__dst_transformed); + auto __dst_mdspan = __as_mdspan_t(__dst_as_arg); + + __fill_bytes_impl( + __stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value); +} + } // namespace cuda::experimental #endif // __CUDAX_ALGORITHM_FILL diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index 2789a1f4802..4b262966190 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p } } +template +auto make_buffer_for_mdspan(Extents extents, char value = 0) +{ + cuda::mr::pinned_memory_resource host_resource; + auto mapping = typename Layout::template mapping{extents}; + + cudax::uninitialized_buffer buffer(host_resource, mapping.required_span_size()); + + memset(buffer.data(), value, buffer.size_bytes()); + + return buffer; +} + namespace cuda::experimental { // Need a type that goes through all launch_transform steps, but is not a contiguous_range +template > struct weird_buffer { const cuda::mr::pinned_memory_resource& resource; @@ -57,7 +71,9 @@ struct weird_buffer : resource(res) , data((int*) res.allocate(s * sizeof(int))) , size(s) - {} + { + memset(data, 0, size); + } ~weird_buffer() { @@ -72,12 +88,18 @@ struct weird_buffer int* data; std::size_t size; - using __as_kernel_arg = cuda::std::span; + using __as_kernel_arg = AsKernelArg; operator cuda::std::span() { return {data, size}; } + + template + operator cuda::std::mdspan() + { + return cuda::std::mdspan{data}; + } }; _CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept @@ -85,9 +107,6 @@ struct weird_buffer return {self.data, self.size}; } }; - -static_assert(std::is_same_v, cuda::std::span>); - } // namespace cuda::experimental #endif // __ALGORITHM_COMMON__ diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu index 07eabba32e6..3db65e22c51 100644 --- a/cudax/test/algorithm/copy.cu +++ b/cudax/test/algorithm/copy.cu @@ -10,7 +10,7 @@ #include "common.cuh" -TEST_CASE("Copy", "[data_manipulation]") +TEST_CASE("1d Copy", "[data_manipulation]") { cudax::stream _stream; @@ -103,3 +103,67 @@ TEST_CASE("Copy", "[data_manipulation]") CUDAX_REQUIRE(vec[1] == 0xbeef); } } + +template +void test_mdspan_copy_bytes( + cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents()) +{ + auto src_buffer = make_buffer_for_mdspan(src_extents, 1); + auto dst_buffer = make_buffer_for_mdspan(dst_extents, 0); + + cuda::std::mdspan src(src_buffer.data(), src_extents); + cuda::std::mdspan dst(dst_buffer.data(), dst_extents); + + for (int i = 0; i < static_cast(src.extent(1)); i++) + { + src(0, i) = i; + } + + cudax::copy_bytes(stream, std::move(src), dst); + stream.wait(); + + for (int i = 0; i < static_cast(dst.extent(1)); i++) + { + CUDAX_CHECK(dst(0, i) == i); + } +} + +TEST_CASE("Mdspan copy", "[data_manipulation]") +{ + cudax::stream stream; + + SECTION("Different extents") + { + auto static_extents = cuda::std::extents(); + test_mdspan_copy_bytes(stream, 
static_extents, static_extents); + test_mdspan_copy_bytes(stream, static_extents, static_extents); + + auto dynamic_extents = cuda::std::dextents(3, 4); + test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents); + test_mdspan_copy_bytes(stream, static_extents, dynamic_extents); + test_mdspan_copy_bytes(stream, static_extents, dynamic_extents); + + auto mixed_extents = cuda::std::extents(3); + test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents); + test_mdspan_copy_bytes(stream, mixed_extents, static_extents); + test_mdspan_copy_bytes(stream, mixed_extents, static_extents); + } + + SECTION("Launch transform") + { + auto mixed_extents = + cuda::std::extents(1024, 2); + [[maybe_unused]] auto static_extents = cuda::std::extents(); + auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1); + cuda::std::mdspan mdspan(mdspan_buffer.data(), mixed_extents); + cudax::weird_buffer> buffer{ + cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()}; + + cudax::copy_bytes(stream, mdspan, buffer); + stream.wait(); + CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size())); + } +} diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu index 7111aa848f3..ce733871f51 100644 --- a/cudax/test/algorithm/fill.cu +++ b/cudax/test/algorithm/fill.cu @@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]") check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size)); } } + +TEST_CASE("Mdspan Fill", "[data_manipulation]") +{ + cudax::stream stream; + { + cuda::std::dextents dynamic_extents{1, 2, 3}; + auto buffer = make_buffer_for_mdspan(dynamic_extents, 0); + cuda::std::mdspan dynamic_mdspan(buffer.data(), dynamic_extents); + + cudax::fill_bytes(stream, dynamic_mdspan, fill_byte); + check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size())); + } + { + cuda::std::extents mixed_extents{1}; + auto buffer = make_buffer_for_mdspan(mixed_extents, 0); + cuda::std::mdspan mixed_mdspan(buffer.data(), mixed_extents); + + cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte); + check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size())); + } + { + using static_extents = cuda::std::extents; + auto size = cuda::std::layout_left::mapping().required_span_size(); + cudax::weird_buffer> buffer(cuda::mr::pinned_memory_resource{}, size); + + cudax::fill_bytes(stream, buffer, fill_byte); + check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size)); + } +} From a085ba11095d0849a1ff62fb2f375d2601904868 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Mon, 25 Nov 2024 13:17:29 -0800 Subject: [PATCH 25/45] add a `_CCCL_NO_CONCEPTS` config macro (#2945) Co-authored-by: Bernhard Manfred Gruber --- libcudacxx/include/cuda/std/__cccl/dialect.h | 6 ++++ .../include/cuda/std/__concepts/arithmetic.h | 4 +-- .../include/cuda/std/__concepts/assignable.h | 6 ++-- .../cuda/std/__concepts/boolean_testable.h | 6 ++-- .../cuda/std/__concepts/class_or_enum.h | 4 +-- .../std/__concepts/common_reference_with.h | 6 ++-- .../include/cuda/std/__concepts/common_with.h | 6 ++-- .../cuda/std/__concepts/concept_macros.h | 23 +++++++------- .../cuda/std/__concepts/constructible.h | 6 ++-- .../cuda/std/__concepts/convertible_to.h | 6 ++-- .../include/cuda/std/__concepts/copyable.h | 6 ++-- .../cuda/std/__concepts/derived_from.h | 6 ++-- .../cuda/std/__concepts/destructible.h | 4 +-- .../cuda/std/__concepts/different_from.h | 4 +-- .../cuda/std/__concepts/equality_comparable.h | 6 ++-- 
.../include/cuda/std/__concepts/invocable.h | 6 ++-- .../include/cuda/std/__concepts/movable.h | 6 ++-- .../include/cuda/std/__concepts/predicate.h | 6 ++-- .../include/cuda/std/__concepts/regular.h | 6 ++-- .../include/cuda/std/__concepts/relation.h | 6 ++-- .../include/cuda/std/__concepts/same_as.h | 4 +-- .../include/cuda/std/__concepts/semiregular.h | 6 ++-- .../include/cuda/std/__concepts/swappable.h | 26 ++++++++-------- .../cuda/std/__concepts/totally_ordered.h | 6 ++-- .../include/cuda/std/__iterator/concepts.h | 4 +-- .../std/__iterator/incrementable_traits.h | 4 +-- .../include/cuda/std/__iterator/iter_move.h | 12 ++++---- .../include/cuda/std/__iterator/iter_swap.h | 6 ++-- .../cuda/std/__iterator/iterator_traits.h | 22 +++++++------- .../cuda/std/__iterator/move_iterator.h | 18 +++++------ .../cuda/std/__iterator/readable_traits.h | 4 +-- .../cuda/std/__iterator/reverse_iterator.h | 30 +++++++++---------- libcudacxx/include/cuda/std/__ranges/access.h | 12 ++++---- .../include/cuda/std/__ranges/concepts.h | 13 ++++---- libcudacxx/include/cuda/std/__ranges/data.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/empty.h | 6 ++-- .../include/cuda/std/__ranges/enable_view.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/rbegin.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/rend.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/size.h | 10 +++---- .../include/cuda/std/__ranges/subrange.h | 18 +++++------ .../cuda/std/__ranges/view_interface.h | 12 ++++---- 42 files changed, 187 insertions(+), 179 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 8dfedd5a3cc..407f2db6ecf 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -80,6 +80,7 @@ # define _CCCL_IF_CONSTEXPR if constexpr # define _CCCL_ELSE_IF_CONSTEXPR else if constexpr #else // ^^^ C++17 ^^^ / vvv C++14 vvv +# define _CCCL_NO_IF_CONSTEXPR # define _CCCL_IF_CONSTEXPR if # define _CCCL_ELSE_IF_CONSTEXPR else if #endif // _CCCL_STD_VER <= 2014 @@ -104,6 +105,11 @@ # define _CCCL_NO_VARIABLE_TEMPLATES #endif // _CCCL_STD_VER <= 2011 +// concepts are only available from C++20 onwards +#if _CCCL_STD_VER <= 2017 || !defined(__cpp_concepts) || (__cpp_concepts < 201907L) +# define _CCCL_NO_CONCEPTS +#endif // _CCCL_STD_VER <= 2017 || !defined(__cpp_concepts) || (__cpp_concepts < 201907L) + // noexcept function types are only available from C++17 onwards #if _CCCL_STD_VER >= 2017 && defined(__cpp_noexcept_function_type) && (__cpp_noexcept_function_type >= 201510L) # define _CCCL_FUNCTION_TYPE_NOEXCEPT noexcept diff --git a/libcudacxx/include/cuda/std/__concepts/arithmetic.h b/libcudacxx/include/cuda/std/__concepts/arithmetic.h index 4f653cd35fc..5a643652824 100644 --- a/libcudacxx/include/cuda/std/__concepts/arithmetic.h +++ b/libcudacxx/include/cuda/std/__concepts/arithmetic.h @@ -30,7 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) // [concepts.arithmetic], arithmetic concepts @@ -49,7 +49,7 @@ _CCCL_CONCEPT floating_point = _CCCL_TRAIT(is_floating_point, _Tp); template _CCCL_CONCEPT __libcpp_signed_integer = __libcpp_is_signed_integer<_Tp>::value; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/assignable.h b/libcudacxx/include/cuda/std/__concepts/assignable.h index d3b0c89e96d..d2d3c96d64d 100644 --- 
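The per-header hunks that follow (the assignable.h one resumes directly below) all apply the same mechanical substitution of the language-version check with the new macro, so a minimal sketch of the resulting pattern may help; __fits_in_pointer is a hypothetical concept used only for illustration.

// Hedged sketch of the dual-path pattern gated by _CCCL_NO_CONCEPTS (hypothetical concept).
#include <cuda/std/__cccl/dialect.h> // internal header shown in this patch; normally reached indirectly

#if !defined(_CCCL_NO_CONCEPTS)
template <class _Tp>
concept __fits_in_pointer = sizeof(_Tp) <= sizeof(void*);
#else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv
template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __fits_in_pointer = sizeof(_Tp) <= sizeof(void*);
#endif // _CCCL_NO_CONCEPTS

Checking the dedicated macro instead of `_CCCL_STD_VER > 2017` keeps the decision in one place (dialect.h) and also covers compilers whose `__cpp_concepts` value lags behind the selected standard.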
a/libcudacxx/include/cuda/std/__concepts/assignable.h +++ b/libcudacxx/include/cuda/std/__concepts/assignable.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.assignable] @@ -40,7 +40,7 @@ concept assignable_from = { __lhs = _CUDA_VSTD::forward<_Rhs>(__rhs) } -> same_as<_Lhs>; }; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -53,7 +53,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT assignable_from = _CCCL_FRAGMENT(__assignable_from_, _Lhs, _Rhs); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/boolean_testable.h b/libcudacxx/include/cuda/std/__concepts/boolean_testable.h index c3717385ebd..adc07b35842 100644 --- a/libcudacxx/include/cuda/std/__concepts/boolean_testable.h +++ b/libcudacxx/include/cuda/std/__concepts/boolean_testable.h @@ -26,7 +26,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concepts.booleantestable] @@ -38,7 +38,7 @@ concept __boolean_testable = __boolean_testable_impl<_Tp> && requires(_Tp&& __t) { !_CUDA_VSTD::forward<_Tp>(__t) } -> __boolean_testable_impl; }; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT __boolean_testable_impl = convertible_to<_Tp, bool>; @@ -52,7 +52,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __boolean_testable = _CCCL_FRAGMENT(__boolean_testable_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/class_or_enum.h b/libcudacxx/include/cuda/std/__concepts/class_or_enum.h index 390ec8c5991..f94dec899f2 100644 --- a/libcudacxx/include/cuda/std/__concepts/class_or_enum.h +++ b/libcudacxx/include/cuda/std/__concepts/class_or_enum.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template _CCCL_CONCEPT __class_or_enum = _CCCL_TRAIT(is_class, _Tp) || _CCCL_TRAIT(is_union, _Tp) || _CCCL_TRAIT(is_enum, _Tp); @@ -39,7 +39,7 @@ template _CCCL_CONCEPT __workaround_52970 = _CCCL_TRAIT(is_class, remove_cvref_t<_Tp>) || _CCCL_TRAIT(is_union, remove_cvref_t<_Tp>); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/common_reference_with.h b/libcudacxx/include/cuda/std/__concepts/common_reference_with.h index a41f04a1563..648805ca871 100644 --- a/libcudacxx/include/cuda/std/__concepts/common_reference_with.h +++ b/libcudacxx/include/cuda/std/__concepts/common_reference_with.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.commonref] @@ -38,7 +38,7 @@ concept common_reference_with = same_as, common_reference_t<_Up, _Tp>> && convertible_to<_Tp, common_reference_t<_Tp, _Up>> && convertible_to<_Up, common_reference_t<_Tp, _Up>>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__common_reference_exists_, @@ -58,7 +58,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT common_reference_with = _CCCL_FRAGMENT(__common_reference_with_, _Tp, _Up); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ 
!_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/common_with.h b/libcudacxx/include/cuda/std/__concepts/common_with.h index 683ce44f5e4..20bb3680755 100644 --- a/libcudacxx/include/cuda/std/__concepts/common_with.h +++ b/libcudacxx/include/cuda/std/__concepts/common_with.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.common] @@ -39,7 +39,7 @@ concept common_with = same_as, common_type_t<_Up, _Tp>> static_cast>(_CUDA_VSTD::declval<_Up>()); } && common_reference_with, add_lvalue_reference_t> && common_reference_with>, common_reference_t, add_lvalue_reference_t>>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__common_type_exists_, @@ -71,7 +71,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT common_with = _CCCL_FRAGMENT(__common_with_, _Tp, _Up); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/concept_macros.h b/libcudacxx/include/cuda/std/__concepts/concept_macros.h index 18587ca57df..8fc98cde0ff 100644 --- a/libcudacxx/include/cuda/std/__concepts/concept_macros.h +++ b/libcudacxx/include/cuda/std/__concepts/concept_macros.h @@ -52,21 +52,22 @@ using __cccl_enable_if_t = typename __cccl_select<_Bp>::template type<_Tp>; template using __cccl_requires_t = typename __cccl_select<_Bp>::template type<_Tp>; -#if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) || defined(_CCCL_DOXYGEN_INVOKED) +#if !defined(_CCCL_NO_CONCEPTS) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_TEMPLATE(...) template <__VA_ARGS__> # define _CCCL_REQUIRES(...) requires __VA_ARGS__ # define _CCCL_AND && # define _CCCL_TRAILING_REQUIRES_AUX_(...) requires __VA_ARGS__ # define _CCCL_TRAILING_REQUIRES(...) ->__VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ -#else // ^^^ __cpp_concepts ^^^ / vvv !__cpp_concepts vvv +#else // ^^^ _CCCL_NO_CONCEPTS ^^^ / vvv !_CCCL_NO_CONCEPTS vvv # define _CCCL_TEMPLATE(...) template <__VA_ARGS__ # define _CCCL_REQUIRES(...) , bool __cccl_true_ = true, __cccl_enable_if_t < __VA_ARGS__ && __cccl_true_, int > = 0 > # define _CCCL_AND &&__cccl_true_, int > = 0, __cccl_enable_if_t < # define _CCCL_TRAILING_REQUIRES_AUX_(...) , __VA_ARGS__ > # define _CCCL_TRAILING_REQUIRES(...) ->__cccl_requires_t < __VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ -#endif // !__cpp_concepts +#endif // !defined(_CCCL_NO_CONCEPTS) -#if _CCCL_STD_VER >= 2014 +// The following concepts emulation macros need variable template support +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template struct __cccl_tag; @@ -141,7 +142,7 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- # define _CCCL_PP_EAT_TYPENAME_SELECT_1(...) _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_, __VA_ARGS__) # define _CCCL_PP_EAT_TYPENAME_typename -# if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) || defined(_CCCL_DOXYGEN_INVOKED) +# if !defined(_CCCL_NO_CONCEPTS) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_CONCEPT concept @@ -167,7 +168,7 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- # define _CCCL_FRAGMENT(_NAME, ...) 
_NAME<__VA_ARGS__> -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv # define _CCCL_CONCEPT _CCCL_INLINE_VAR constexpr bool @@ -207,7 +208,7 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- # define _CCCL_FRAGMENT(_NAME, ...) \ (1u == sizeof(_NAME##_CCCL_CONCEPT_FRAGMENT_(static_cast<::__cccl_tag<__VA_ARGS__>*>(nullptr), nullptr))) -# endif +# endif // ^^^ _CCCL_NO_CONCEPTS ^^^ //////////////////////////////////////////////////////////////////////////////// // _CCCL_REQUIRES_EXPR @@ -220,10 +221,10 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- // ); // // Can only be used as the last requirement in a concept definition. -# if defined(__cpp_concepts) && _CCCL_STD_VER >= 2020 || defined(_CCCL_DOXYGEN_INVOKED) +# if !defined(_CCCL_NO_CONCEPTS) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_REQUIRES_EXPR(_TY, ...) requires(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 # define _CCCL_REQUIRES_EXPR_2(...) {_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__)} -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv # define _CCCL_REQUIRES_EXPR_TPARAM_PROBE_variadic _CCCL_PP_PROBE(~) # define _CCCL_REQUIRES_EXPR_TPARAM_variadic @@ -268,8 +269,8 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- return false; \ } \ } -# endif +# endif // ^^^ _CCCL_NO_CONCEPTS ^^^ -#endif // _CCCL_STD_VER >= 2014 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES ^^^ #endif //_CUDA___CONCEPTS diff --git a/libcudacxx/include/cuda/std/__concepts/constructible.h b/libcudacxx/include/cuda/std/__concepts/constructible.h index 13879811f8b..08c579060fe 100644 --- a/libcudacxx/include/cuda/std/__concepts/constructible.h +++ b/libcudacxx/include/cuda/std/__concepts/constructible.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.constructible] template @@ -52,7 +52,7 @@ concept copy_constructible = && constructible_from<_Tp, const _Tp&> && convertible_to && constructible_from<_Tp, const _Tp> && convertible_to; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__constructible_from_, @@ -96,7 +96,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT copy_constructible = _CCCL_FRAGMENT(__copy_constructible_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/convertible_to.h b/libcudacxx/include/cuda/std/__concepts/convertible_to.h index 169383cb095..45eebf9d97d 100644 --- a/libcudacxx/include/cuda/std/__concepts/convertible_to.h +++ b/libcudacxx/include/cuda/std/__concepts/convertible_to.h @@ -28,12 +28,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // [concept.convertible] -#if _CCCL_STD_VER >= 2020 +#if !defined(_CCCL_NO_CONCEPTS) template concept convertible_to = is_convertible_v<_From, _To> && requires { static_cast<_To>(_CUDA_VSTD::declval<_From>()); }; -#elif _CCCL_STD_VER >= 2014 // ^^^ C++20 ^^^ / vvv C++14/17 vvv +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ # if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(1211) // nonstandard cast to array type ignored @@ -60,7 +60,7 @@ _CCCL_NV_DIAG_DEFAULT(1211) // nonstandard cast to array type ignored # endif // _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(171) // invalid type conversion, e.g. 
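(The convertible_to.h hunk resumes right below.) Since the emulation macros above are what most of the remaining hunks rely on, a small constrained-function sketch may help; the function, the constraint, and the direct include of the internal header are made up for illustration only.

// Hedged sketch: one signature that compiles with real concepts and with the emulation path.
#include <cuda/std/__concepts/concept_macros.h> // internal header; normally reached via other cuda/std headers
#include <cuda/std/type_traits>

_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(cuda::std::is_integral<_Tp>::value _CCCL_AND(sizeof(_Tp) >= 4))
__host__ __device__ _Tp twice(_Tp __v)
{
  return static_cast<_Tp>(__v + __v); // participates in overload resolution only for 4+ byte integer types
}

In the emulation path `_CCCL_REQUIRES` closes the template parameter list itself, which is why `_CCCL_TEMPLATE` deliberately leaves it open; the two macros must always be used as a pair.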
[with _From=int **, _To=const int *const *] -#endif // _CCCL_STD_VER >= 2014 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES ^^^ _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/copyable.h b/libcudacxx/include/cuda/std/__concepts/copyable.h index 11bf23329bc..1ba79c71ed2 100644 --- a/libcudacxx/include/cuda/std/__concepts/copyable.h +++ b/libcudacxx/include/cuda/std/__concepts/copyable.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concepts.object] @@ -35,7 +35,7 @@ template concept copyable = copy_constructible<_Tp> && movable<_Tp> && assignable_from<_Tp&, _Tp&> && assignable_from<_Tp&, const _Tp&> && assignable_from<_Tp&, const _Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -49,7 +49,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT copyable = _CCCL_FRAGMENT(__copyable_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/derived_from.h b/libcudacxx/include/cuda/std/__concepts/derived_from.h index ff3f0cb2411..dca99425d54 100644 --- a/libcudacxx/include/cuda/std/__concepts/derived_from.h +++ b/libcudacxx/include/cuda/std/__concepts/derived_from.h @@ -27,14 +27,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.derived] template concept derived_from = is_base_of_v<_Bp, _Dp> && is_convertible_v; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -46,7 +46,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT derived_from = _CCCL_FRAGMENT(__derived_from_, _Dp, _Bp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/destructible.h b/libcudacxx/include/cuda/std/__concepts/destructible.h index 90426478490..62d241b9e33 100644 --- a/libcudacxx/include/cuda/std/__concepts/destructible.h +++ b/libcudacxx/include/cuda/std/__concepts/destructible.h @@ -30,7 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) # if _CCCL_COMPILER(MSVC) @@ -69,7 +69,7 @@ _CCCL_CONCEPT destructible = __destructible<_Tp>; # endif // !_CCCL_COMPILER(MSVC) -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/different_from.h b/libcudacxx/include/cuda/std/__concepts/different_from.h index 596fa0c2587..0675c0171b0 100644 --- a/libcudacxx/include/cuda/std/__concepts/different_from.h +++ b/libcudacxx/include/cuda/std/__concepts/different_from.h @@ -26,12 +26,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template _CCCL_CONCEPT __different_from = !same_as, remove_cvref_t<_Up>>; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/equality_comparable.h b/libcudacxx/include/cuda/std/__concepts/equality_comparable.h index c2909df1a3b..ed599a7f2cb 100644 --- a/libcudacxx/include/cuda/std/__concepts/equality_comparable.h +++ b/libcudacxx/include/cuda/std/__concepts/equality_comparable.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 
+#if !defined(_CCCL_NO_CONCEPTS) // [concept.equalitycomparable] @@ -51,7 +51,7 @@ concept equality_comparable_with = && equality_comparable, __make_const_lvalue_ref<_Up>>> && __weakly_equality_comparable_with<_Tp, _Up>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__with_lvalue_reference_, requires()(typename(__make_const_lvalue_ref<_Tp>))); @@ -89,7 +89,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT equality_comparable_with = _CCCL_FRAGMENT(__equality_comparable_with_, _Tp, _Up); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/invocable.h b/libcudacxx/include/cuda/std/__concepts/invocable.h index c9dda78270e..864821362e7 100644 --- a/libcudacxx/include/cuda/std/__concepts/invocable.h +++ b/libcudacxx/include/cuda/std/__concepts/invocable.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.invocable] @@ -48,7 +48,7 @@ concept __invoke_constructible = requires(_Fun&& __fun, _Args&&... __args) { _CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...)); }; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(_Invocable_, @@ -69,7 +69,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __invoke_constructible = _CCCL_FRAGMENT(__invoke_constructible_, _Fun, _Args...); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/movable.h b/libcudacxx/include/cuda/std/__concepts/movable.h index 18f47cba6c5..98641e22319 100644 --- a/libcudacxx/include/cuda/std/__concepts/movable.h +++ b/libcudacxx/include/cuda/std/__concepts/movable.h @@ -28,12 +28,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template concept movable = is_object_v<_Tp> && move_constructible<_Tp> && assignable_from<_Tp&, _Tp> && swappable<_Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concepts.object] template @@ -47,7 +47,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT movable = _CCCL_FRAGMENT(_Movable_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/predicate.h b/libcudacxx/include/cuda/std/__concepts/predicate.h index 7d8ee168583..8538468063c 100644 --- a/libcudacxx/include/cuda/std/__concepts/predicate.h +++ b/libcudacxx/include/cuda/std/__concepts/predicate.h @@ -27,12 +27,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template concept predicate = regular_invocable<_Fn, _Args...> && __boolean_testable>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concept.predicate] template @@ -43,7 +43,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT predicate = _CCCL_FRAGMENT(_Predicate_, _Fn, _Args...); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/regular.h b/libcudacxx/include/cuda/std/__concepts/regular.h index 506dc7700f7..757976cf338 100644 --- a/libcudacxx/include/cuda/std/__concepts/regular.h 
+++ b/libcudacxx/include/cuda/std/__concepts/regular.h @@ -26,14 +26,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.object] template concept regular = semiregular<_Tp> && equality_comparable<_Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concept.object] @@ -43,7 +43,7 @@ _CCCL_CONCEPT_FRAGMENT(__regular_, requires()(requires(semiregular<_Tp>), requir template _CCCL_CONCEPT regular = _CCCL_FRAGMENT(__regular_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/relation.h b/libcudacxx/include/cuda/std/__concepts/relation.h index e6006db9a8a..9d552c195bb 100644 --- a/libcudacxx/include/cuda/std/__concepts/relation.h +++ b/libcudacxx/include/cuda/std/__concepts/relation.h @@ -25,7 +25,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.relation] @@ -43,7 +43,7 @@ concept equivalence_relation = relation<_Rp, _Tp, _Up>; template concept strict_weak_order = relation<_Rp, _Tp, _Up>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -66,7 +66,7 @@ _CCCL_CONCEPT equivalence_relation = relation<_Rp, _Tp, _Up>; template _CCCL_CONCEPT strict_weak_order = relation<_Rp, _Tp, _Up>; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/same_as.h b/libcudacxx/include/cuda/std/__concepts/same_as.h index 59b59d6afb7..6247b74d5ec 100644 --- a/libcudacxx/include/cuda/std/__concepts/same_as.h +++ b/libcudacxx/include/cuda/std/__concepts/same_as.h @@ -25,7 +25,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) // [concept.same] @@ -35,7 +35,7 @@ _CCCL_CONCEPT __same_as_impl = _IsSame<_Tp, _Up>::value; template _CCCL_CONCEPT same_as = __same_as_impl<_Tp, _Up> && __same_as_impl<_Up, _Tp>; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/semiregular.h b/libcudacxx/include/cuda/std/__concepts/semiregular.h index ae3876885e3..e3c5dd482a6 100644 --- a/libcudacxx/include/cuda/std/__concepts/semiregular.h +++ b/libcudacxx/include/cuda/std/__concepts/semiregular.h @@ -26,14 +26,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.object] template concept semiregular = copyable<_Tp> && default_initializable<_Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concept.object] @@ -43,7 +43,7 @@ _CCCL_CONCEPT_FRAGMENT(__semiregular_, requires()(requires(copyable<_Tp>), requi template _CCCL_CONCEPT semiregular = _CCCL_FRAGMENT(__semiregular_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/swappable.h b/libcudacxx/include/cuda/std/__concepts/swappable.h index 8688e71a702..2ad1e4270a3 100644 --- a/libcudacxx/include/cuda/std/__concepts/swappable.h +++ b/libcudacxx/include/cuda/std/__concepts/swappable.h @@ -41,7 +41,7 @@ _CCCL_NV_DIAG_SUPPRESS(461) // nonstandard cast to array type ignored #endif // _CCCL_COMPILER(MSVC) -#if _CCCL_STD_VER > 2011 +#if 
!defined(_CCCL_NO_VARIABLE_TEMPLATES) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -52,7 +52,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__swap) template void swap(_Tp&, _Tp&) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __unqualified_swappable_with = (__class_or_enum> || __class_or_enum>) @@ -62,7 +62,7 @@ template concept __exchangeable = !__unqualified_swappable_with<_Tp&, _Tp&> && move_constructible<_Tp> && assignable_from<_Tp&, _Tp>; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( @@ -80,9 +80,9 @@ _CCCL_CONCEPT_FRAGMENT(__exchangeable_, template _CCCL_CONCEPT __exchangeable = _CCCL_FRAGMENT(__exchangeable_, _Tp); -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS -# if _CCCL_STD_VER > 2017 && !_CCCL_COMPILER(NVHPC) // nvbug4051640 +# if !defined(_CCCL_NO_CONCEPTS) && !_CCCL_COMPILER(NVHPC) // nvbug4051640 struct __fn; _CCCL_NV_DIAG_SUPPRESS(2642) @@ -92,10 +92,10 @@ concept __swappable_arrays = && requires(_Tp (&__t)[_Size], _Up (&__u)[_Size], const __fn& __swap) { __swap(__t[0], __u[0]); }; _CCCL_NV_DIAG_DEFAULT(2642) -# else +# else // ^^^ !_CCCL_NO_CONCEPTS && !_CCCL_COMPILER(NVHPC) ^^^ / vvv _CCCL_NO_CONCEPTS || _CCCL_COMPILER(NVHPC) vvv template _CCCL_INLINE_VAR constexpr bool __swappable_arrays = false; -# endif // _CCCL_STD_VER < 2020 || _CCCL_COMPILER(NVHPC) +# endif // _CCCL_NO_CONCEPTS || _CCCL_COMPILER(NVHPC) template _CCCL_INLINE_VAR constexpr bool __noexcept_swappable_arrays = false; @@ -135,7 +135,7 @@ struct __fn } }; -# if _CCCL_STD_VER < 2020 || _CCCL_COMPILER(NVHPC) +# if defined(_CCCL_NO_CONCEPTS) || _CCCL_COMPILER(NVHPC) template _CCCL_CONCEPT_FRAGMENT( __swappable_arrays_, @@ -147,7 +147,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_INLINE_VAR constexpr bool __swappable_arrays<_Tp, _Up, _Size, void_t>> = _CCCL_FRAGMENT(__swappable_arrays_, _Tp, _Up, _CUDA_VSTD::integral_constant); -# endif // _CCCL_STD_VER < 2020 || _CCCL_COMPILER(NVHPC) +# endif // _CCCL_NO_CONCEPTS || _CCCL_COMPILER(NVHPC) template _CCCL_INLINE_VAR constexpr bool __noexcept_swappable_arrays<_Tp, _Up, void_t>> = @@ -163,7 +163,7 @@ _LIBCUDACXX_END_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_STD -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept swappable = requires(_Tp& __a, _Tp& __b) { _CUDA_VRANGES::swap(__a, __b); }; @@ -174,7 +174,7 @@ concept swappable_with = common_reference_with<_Tp, _Up> && requires(_Tp&& __t, _CUDA_VRANGES::swap(_CUDA_VSTD::forward<_Tp>(__t), _CUDA_VSTD::forward<_Up>(__u)); _CUDA_VRANGES::swap(_CUDA_VSTD::forward<_Up>(__u), _CUDA_VSTD::forward<_Tp>(__t)); }; -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__swappable_, requires(_Tp& __a, _Tp& __b)((_CUDA_VRANGES::swap(__a, __b)))); @@ -193,11 +193,11 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT swappable_with = _CCCL_FRAGMENT(__swappable_with_, _Tp, _Up); -# endif +# endif // ^^^ _CCCL_NO_CONCEPTS ^^^ _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES #if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(461) // nonstandard cast to array type ignored diff --git a/libcudacxx/include/cuda/std/__concepts/totally_ordered.h b/libcudacxx/include/cuda/std/__concepts/totally_ordered.h index 59e9254289a..088098956c0 100644 --- a/libcudacxx/include/cuda/std/__concepts/totally_ordered.h +++ b/libcudacxx/include/cuda/std/__concepts/totally_ordered.h @@ -28,7 +28,7 @@ 
_LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.totallyordered] @@ -53,7 +53,7 @@ concept totally_ordered_with = && totally_ordered, __make_const_lvalue_ref<_Up>>> && __partially_ordered_with<_Tp, _Up>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -91,7 +91,7 @@ template _CCCL_CONCEPT totally_ordered_with = _CCCL_FRAGMENT(__totally_ordered_with_, _Tp, _Up); ; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__iterator/concepts.h b/libcudacxx/include/cuda/std/__iterator/concepts.h index e4e507afe83..ef36ad11f9d 100644 --- a/libcudacxx/include/cuda/std/__iterator/concepts.h +++ b/libcudacxx/include/cuda/std/__iterator/concepts.h @@ -53,7 +53,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [iterator.concept.readable] template @@ -254,7 +254,7 @@ concept indirectly_copyable_storable = // Note: indirectly_swappable is located in iter_swap.h to prevent a dependency cycle // (both iter_swap and indirectly_swappable require indirectly_readable). -#elif _CCCL_STD_VER > 2014 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [iterator.concept.readable] template diff --git a/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h b/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h index e9d462eeaf4..4555b4ae412 100644 --- a/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h @@ -37,7 +37,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [incrementable.traits] template @@ -88,7 +88,7 @@ using iter_difference_t = incrementable_traits>, iterator_traits>>::difference_type; -#elif _CCCL_STD_VER > 2014 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [incrementable.traits] template diff --git a/libcudacxx/include/cuda/std/__iterator/iter_move.h b/libcudacxx/include/cuda/std/__iterator/iter_move.h index 1dfb489933b..54ce7692c1e 100644 --- a/libcudacxx/include/cuda/std/__iterator/iter_move.h +++ b/libcudacxx/include/cuda/std/__iterator/iter_move.h @@ -42,7 +42,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__iter_move) _CCCL_HOST_DEVICE void iter_move(); -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __unqualified_iter_move = __class_or_enum> && requires(_Tp&& __t) { iter_move(_CUDA_VSTD::forward<_Tp>(__t)); }; @@ -59,7 +59,7 @@ concept __just_deref = !__unqualified_iter_move<_Tp> && !__move_deref<_Tp> && re requires(!is_lvalue_reference_v); }; -# else // ^^^ _CCCL_STD_VER >= 2020 ^^^ / vvv _CCCL_STD_VER <= 2017 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( @@ -85,7 +85,7 @@ _CCCL_CONCEPT_FRAGMENT(__just_deref_, template _CCCL_CONCEPT __just_deref = _CCCL_FRAGMENT(__just_deref_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS // [iterator.cust.move] @@ -124,14 +124,14 @@ _LIBCUDACXX_END_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_STD -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template <__dereferenceable _Tp> requires requires(_Tp& __t) { { _CUDA_VRANGES::iter_move(__t) } -> __can_reference; } using iter_rvalue_reference_t = decltype(_CUDA_VRANGES::iter_move(_CUDA_VSTD::declval<_Tp&>())); -# else // ^^^ _CCCL_STD_VER >= 2020 ^^^ / vvv 
_CCCL_STD_VER <= 2017 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__can_iter_rvalue_reference_t_, @@ -146,7 +146,7 @@ using __iter_rvalue_reference_t = decltype(_CUDA_VRANGES::iter_move(_CUDA_VSTD:: template using iter_rvalue_reference_t = enable_if_t<__can_iter_rvalue_reference_t<_Tp>, __iter_rvalue_reference_t<_Tp>>; -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__iterator/iter_swap.h b/libcudacxx/include/cuda/std/__iterator/iter_swap.h index a4047cbba10..bafeed69742 100644 --- a/libcudacxx/include/cuda/std/__iterator/iter_swap.h +++ b/libcudacxx/include/cuda/std/__iterator/iter_swap.h @@ -39,7 +39,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__iter_swap) template void iter_swap(_I1, _I2) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __unqualified_iter_swap = (__class_or_enum> || __class_or_enum>) @@ -52,7 +52,7 @@ concept __readable_swappable = !__unqualified_iter_swap<_T1, _T2> && indirectly_ template concept __moveable_storable = !__unqualified_iter_swap<_T1, _T2> && !__readable_swappable<_T1, _T2> && indirectly_movable_storable<_T1, _T2> && indirectly_movable_storable<_T2, _T1>; -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __unqualified_iter_swap_, @@ -83,7 +83,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __moveable_storable = _CCCL_FRAGMENT(__moveable_storable_, _T1, _T2); -# endif // _CCCL_STD_VER > 2011 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h index 759af45cc3a..020f27071db 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h @@ -67,7 +67,7 @@ struct __cccl_std_contiguous_iterator_tag_exists : __cccl_type_is_defined= 2020 +#if !defined(_CCCL_NO_CONCEPTS) template using __with_reference = _Tp&; @@ -87,7 +87,7 @@ using iter_reference_t = decltype(*declval<_Tp&>()); template struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits; -#elif _CCCL_STD_VER >= 2017 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ template using __with_reference = _Tp&; @@ -113,7 +113,7 @@ using iter_reference_t = enable_if_t<__dereferenceable<_Tp>, decltype(*_CUDA_VST template struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits; -#else // ^^^ _CCCL_STD_VER >= 2017 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv +#else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv template struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits; #endif // _CCCL_STD_VER <= 2014 @@ -242,7 +242,7 @@ struct __has_iterator_concept static const bool value = decltype(__test<_Tp>(nullptr))::value; }; -#if _CCCL_STD_VER >= 2020 +#if !defined(_CCCL_NO_CONCEPTS) // The `cpp17-*-iterator` exposition-only concepts have very similar names to the `Cpp17*Iterator` named requirements // from `[iterator.cpp17]`. 
To avoid confusion between the two, the exposition-only concepts have been banished to @@ -484,7 +484,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits : __iterator_traits<_Ip> using __primary_template = iterator_traits; }; -#elif _CCCL_STD_VER >= 2017 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_STD_VER > 2014 vvv // The `cpp17-*-iterator` exposition-only concepts have very similar names to the `Cpp17*Iterator` named requirements // from `[iterator.cpp17]`. To avoid confusion between the two, the exposition-only concepts have been banished to @@ -764,7 +764,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits : __iterator_traits<_Ip> using __primary_template = iterator_traits; }; -#else // _CCCL_STD_VER >= 2014 +#else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv template struct __iterator_traits @@ -804,7 +804,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits : __iterator_traits<_Iter, #endif // _CCCL_STD_VER <= 2014 template -#if _CCCL_STD_VER >= 2020 +#if !defined(_CCCL_NO_CONCEPTS) requires is_object_v<_Tp> #endif struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits<_Tp*> @@ -814,7 +814,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits<_Tp*> typedef _Tp* pointer; typedef typename add_lvalue_reference<_Tp>::type reference; typedef random_access_iterator_tag iterator_category; -#if _CCCL_STD_VER >= 2017 +#if _CCCL_STD_VER > 2014 typedef contiguous_iterator_tag iterator_concept; #endif }; @@ -860,17 +860,17 @@ struct __is_cpp17_random_access_iterator // Such iterators receive special "contiguous" optimizations in // std::copy and std::sort. // -#if _CCCL_STD_VER >= 2017 +#if _CCCL_STD_VER > 2014 template struct __is_cpp17_contiguous_iterator : _Or<__has_iterator_category_convertible_to<_Tp, contiguous_iterator_tag>, __has_iterator_concept_convertible_to<_Tp, contiguous_iterator_tag>> {}; -#else +#else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv template struct __is_cpp17_contiguous_iterator : false_type {}; -#endif +#endif // _CCCL_STD_VER <= 2014 // Any native pointer which is an iterator is also a contiguous iterator. 
template diff --git a/libcudacxx/include/cuda/std/__iterator/move_iterator.h b/libcudacxx/include/cuda/std/__iterator/move_iterator.h index efdf656366a..0436b25b36c 100644 --- a/libcudacxx/include/cuda/std/__iterator/move_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/move_iterator.h @@ -44,7 +44,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template struct __move_iter_category_base {}; @@ -67,7 +67,7 @@ concept __move_iter_comparable = requires { template _CCCL_INLINE_VAR constexpr bool __noexcept_move_iter_iter_move = noexcept(_CUDA_VRANGES::iter_move(_CUDA_VSTD::declval<_Iter>())); -#elif _CCCL_STD_VER >= 2017 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_STD_VER > 2014 vvv template struct __move_iter_category_base {}; @@ -92,7 +92,7 @@ _CCCL_CONCEPT __move_iter_comparable = _CCCL_FRAGMENT(__move_iter_comparable_, _ template _CCCL_INLINE_VAR constexpr bool __noexcept_move_iter_iter_move = noexcept(_CUDA_VRANGES::iter_move(_CUDA_VSTD::declval<_Iter>())); -#endif // _CCCL_STD_VER >= 2017 +#endif // _CCCL_STD_VER > 2014 template class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator @@ -179,18 +179,18 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator } #if _CCCL_STD_VER > 2014 -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) _LIBCUDACXX_HIDE_FROM_ABI constexpr move_iterator() requires is_constructible_v<_Iter> : __current_() {} -# else // ^^^ _CCCL_STD_VER > 2017 ^^^ / vvv _CCCL_STD_VER < 2020 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv _CCCL_TEMPLATE(class _It2 = _Iter) _CCCL_REQUIRES(is_constructible_v<_It2>) _LIBCUDACXX_HIDE_FROM_ABI constexpr move_iterator() : __current_() {} -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS _CCCL_TEMPLATE(class _Up) _CCCL_REQUIRES((!_IsSame<_Up, _Iter>::value) && convertible_to) @@ -460,7 +460,7 @@ operator-(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) -> return __x.base() - __y.base(); } -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template _LIBCUDACXX_HIDE_FROM_ABI constexpr move_iterator<_Iter> operator+(iter_difference_t<_Iter> __n, const move_iterator<_Iter>& __x) @@ -470,14 +470,14 @@ operator+(iter_difference_t<_Iter> __n, const move_iterator<_Iter>& __x) { return __x + __n; } -#else // ^^^ _CCCL_STD_VER > 2017 ^^^ / vvv _CCCL_STD_VER < 2020 vvv +#else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 move_iterator<_Iter> operator+(typename move_iterator<_Iter>::difference_type __n, const move_iterator<_Iter>& __x) { return move_iterator<_Iter>(__x.base() + __n); } -#endif // _CCCL_STD_VER < 2020 +#endif // _CCCL_NO_CONCEPTS template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 move_iterator<_Iter> make_move_iterator(_Iter __i) diff --git a/libcudacxx/include/cuda/std/__iterator/readable_traits.h b/libcudacxx/include/cuda/std/__iterator/readable_traits.h index e7e5dcd3bf4..b73086dd968 100644 --- a/libcudacxx/include/cuda/std/__iterator/readable_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/readable_traits.h @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [readable.traits] template @@ -106,7 +106,7 @@ using iter_value_t = indirectly_readable_traits>, iterator_traits>>::value_type; -#elif _CCCL_STD_VER > 2014 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [readable.traits] template diff --git 
a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h index 982312731f9..502f090afff 100644 --- a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h @@ -175,7 +175,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator return *--__tmp; } -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) && !defined(_CCCL_NO_IF_CONSTEXPR) _LIBCUDACXX_HIDE_FROM_ABI constexpr pointer operator->() const requires is_pointer_v<_Iter> || requires(const _Iter __i) { __i.operator->(); } { @@ -285,11 +285,11 @@ struct __is_reverse_iterator> : true_type template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() == __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() == __y.base(); } @@ -297,11 +297,11 @@ operator==(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() > __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() > __y.base(); } @@ -309,11 +309,11 @@ operator<(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& _ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() != __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() != __y.base(); } @@ -321,11 +321,11 @@ operator!=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() < __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() < __y.base(); } @@ -333,11 +333,11 @@ operator>(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& _ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() <= __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() <= __y.base(); } @@ -345,11 +345,11 @@ operator>=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() >= __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() >= __y.base(); } @@ -377,11 +377,11 @@ operator+(typename reverse_iterator<_Iter>::difference_type __n, const reverse_i return 
reverse_iterator<_Iter>(__x.base() - __n); } -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template requires(!sized_sentinel_for<_Iter1, _Iter2>) inline constexpr bool disable_sized_sentinel_for, reverse_iterator<_Iter2>> = true; -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS #if _CCCL_STD_VER > 2011 template diff --git a/libcudacxx/include/cuda/std/__ranges/access.h b/libcudacxx/include/cuda/std/__ranges/access.h index 2c1525e1ad4..3c5ef7da52b 100644 --- a/libcudacxx/include/cuda/std/__ranges/access.h +++ b/libcudacxx/include/cuda/std/__ranges/access.h @@ -46,7 +46,7 @@ void begin(_Tp&) = delete; template void begin(const _Tp&) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_begin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.begin()) } -> input_or_output_iterator; @@ -57,7 +57,7 @@ concept __unqualified_begin = !__member_begin<_Tp> && __can_borrow<_Tp> && __class_or_enum> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(begin(__t)) } -> input_or_output_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_begin_, @@ -78,7 +78,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __unqualified_begin = _CCCL_FRAGMENT(__unqualified_begin_, _Tp); -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS struct __fn { @@ -141,7 +141,7 @@ void end(_Tp&) = delete; template void end(const _Tp&) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_end = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { typename iterator_t<_Tp>; @@ -154,7 +154,7 @@ concept __unqualified_end = typename iterator_t<_Tp>; { _LIBCUDACXX_AUTO_CAST(end(__t)) } -> sentinel_for>; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_end_, @@ -177,7 +177,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __unqualified_end = _CCCL_FRAGMENT(__unqualified_end_, _Tp); -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/concepts.h b/libcudacxx/include/cuda/std/__ranges/concepts.h index 26d7fe421e7..4183f423ea6 100644 --- a/libcudacxx/include/cuda/std/__ranges/concepts.h +++ b/libcudacxx/include/cuda/std/__ranges/concepts.h @@ -46,7 +46,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) // [range.range] @@ -138,7 +138,8 @@ concept viewable_range = || (!view> && (is_lvalue_reference_v<_Tp> || (movable> && !__is_std_initializer_list>) ))); -# else // ^^^ C++20 ^^^ / vvv C++17 vvv + +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv // [range.range] template @@ -285,13 +286,13 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT viewable_range = _CCCL_FRAGMENT(__viewable_range_, _Tp); -# endif // _CCCL_STD_VER >= 2017 +# endif // _CCCL_NO_CONCEPTS //[container.intro.reqmts] -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __container_compatible_range = input_range<_Range> && convertible_to, _Tp>; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __container_compatible_range_, @@ -299,7 +300,7 @@ _CCCL_CONCEPT_FRAGMENT( template 
_CCCL_CONCEPT __container_compatible_range = _CCCL_FRAGMENT(__container_compatible_range_, _Range, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS #endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) diff --git a/libcudacxx/include/cuda/std/__ranges/data.h b/libcudacxx/include/cuda/std/__ranges/data.h index f5bf6015963..0f756d52a9f 100644 --- a/libcudacxx/include/cuda/std/__ranges/data.h +++ b/libcudacxx/include/cuda/std/__ranges/data.h @@ -43,7 +43,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__data) template _CCCL_CONCEPT __ptr_to_object = is_pointer_v<_Tp> && is_object_v>; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_data = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.data()) } -> __ptr_to_object; @@ -53,7 +53,7 @@ template concept __ranges_begin_invocable = !__member_data<_Tp> && __can_borrow<_Tp> && requires(_Tp&& __t) { { _CUDA_VRANGES::begin(__t) } -> contiguous_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__member_data_, requires(_Tp&& __t)(requires(__can_borrow<_Tp>), @@ -71,7 +71,7 @@ _CCCL_CONCEPT_FRAGMENT(__ranges_begin_invocable_, template _CCCL_CONCEPT __ranges_begin_invocable = _CCCL_FRAGMENT(__ranges_begin_invocable_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/empty.h b/libcudacxx/include/cuda/std/__ranges/empty.h index d8f8213e9a8..1494c18882f 100644 --- a/libcudacxx/include/cuda/std/__ranges/empty.h +++ b/libcudacxx/include/cuda/std/__ranges/empty.h @@ -33,7 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__empty) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_empty = __workaround_52970<_Tp> && requires(_Tp&& __t) { bool(__t.empty()); }; @@ -45,7 +45,7 @@ concept __can_compare_begin_end = !__member_empty<_Tp> && !__can_invoke_size<_Tp bool(_CUDA_VRANGES::begin(__t) == _CUDA_VRANGES::end(__t)); { _CUDA_VRANGES::begin(__t) } -> forward_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__member_empty_, requires(_Tp&& __t)(requires(__workaround_52970<_Tp>), (bool(__t.empty())))); @@ -69,7 +69,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __can_compare_begin_end = _CCCL_FRAGMENT(__can_compare_begin_end_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/enable_view.h b/libcudacxx/include/cuda/std/__ranges/enable_view.h index 1e5a09cd541..72e390c0499 100644 --- a/libcudacxx/include/cuda/std/__ranges/enable_view.h +++ b/libcudacxx/include/cuda/std/__ranges/enable_view.h @@ -56,14 +56,14 @@ _CCCL_TEMPLATE(class _Op, class _Yp) _CCCL_REQUIRES(is_convertible_v<_Op*, view_interface<_Yp>*>) _LIBCUDACXX_HIDE_FROM_ABI void __is_derived_from_view_interface(const _Op*, const view_interface<_Yp>*); -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template _CCCL_INLINE_VAR constexpr bool enable_view = derived_from<_Tp, view_base> || requires { _CUDA_VRANGES::__is_derived_from_view_interface((_Tp*) nullptr, (_Tp*) nullptr); }; -# else // ^^^ _CCCL_STD_VER >= 2020 ^^^ / vvv _CCCL_STD_VER <= 2017 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_INLINE_VAR constexpr bool 
enable_view = derived_from<_Tp, view_base>; @@ -72,7 +72,7 @@ template _CCCL_INLINE_VAR constexpr bool enable_view<_Tp, void_t> = true; -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS #endif // _CCCL_STD_VER >= 2017 diff --git a/libcudacxx/include/cuda/std/__ranges/rbegin.h b/libcudacxx/include/cuda/std/__ranges/rbegin.h index 8b70f702797..13cf76b9da9 100644 --- a/libcudacxx/include/cuda/std/__ranges/rbegin.h +++ b/libcudacxx/include/cuda/std/__ranges/rbegin.h @@ -43,7 +43,7 @@ void rbegin(_Tp&) = delete; template void rbegin(const _Tp&) = delete; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_rbegin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.rbegin()) } -> input_or_output_iterator; @@ -61,7 +61,7 @@ concept __can_reverse = { _CUDA_VRANGES::begin(__t) } -> same_as; { _CUDA_VRANGES::begin(__t) } -> bidirectional_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_rbegin_, @@ -94,7 +94,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __can_reverse = _CCCL_FRAGMENT(__can_reverse_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/rend.h b/libcudacxx/include/cuda/std/__ranges/rend.h index 5c266d63bdd..3f21c323eba 100644 --- a/libcudacxx/include/cuda/std/__ranges/rend.h +++ b/libcudacxx/include/cuda/std/__ranges/rend.h @@ -44,7 +44,7 @@ void rend(_Tp&) = delete; template void rend(const _Tp&) = delete; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_rend = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { _CUDA_VRANGES::rbegin(__t); @@ -63,7 +63,7 @@ concept __can_reverse = __can_borrow<_Tp> && !__member_rend<_Tp> && !__unqualifi { _CUDA_VRANGES::begin(__t) } -> same_as; { _CUDA_VRANGES::begin(__t) } -> bidirectional_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_rend_, @@ -100,7 +100,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __can_reverse = _CCCL_FRAGMENT(__can_reverse_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS class __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/size.h b/libcudacxx/include/cuda/std/__ranges/size.h index 04487441586..0b432ae6e87 100644 --- a/libcudacxx/include/cuda/std/__ranges/size.h +++ b/libcudacxx/include/cuda/std/__ranges/size.h @@ -52,7 +52,7 @@ void size(const _Tp&) = delete; template _CCCL_CONCEPT __size_enabled = !disable_sized_range>; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_size = __size_enabled<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.size()) } -> __integer_like; @@ -70,7 +70,7 @@ concept __difference = { _CUDA_VRANGES::begin(__t) } -> forward_iterator; { _CUDA_VRANGES::end(__t) } -> sized_sentinel_for()))>; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__member_size_, requires(_Tp&& __t)(requires(__size_enabled<_Tp>), @@ -103,7 +103,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __difference = _CCCL_FRAGMENT(__difference_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { @@ -162,10 +162,10 @@ _CCCL_GLOBAL_CONSTANT 
auto size = __size::__fn{}; // [range.prim.ssize] _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__ssize) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __can_ssize = requires(_Tp&& __t) { _CUDA_VRANGES::size(__t); }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__can_ssize_, requires(_Tp&& __t)(requires(!is_unbounded_array_v<_Tp>), ((void) _CUDA_VRANGES::size(__t)))); diff --git a/libcudacxx/include/cuda/std/__ranges/subrange.h b/libcudacxx/include/cuda/std/__ranges/subrange.h index 190df21d43b..484ce8c1f46 100644 --- a/libcudacxx/include/cuda/std/__ranges/subrange.h +++ b/libcudacxx/include/cuda/std/__ranges/subrange.h @@ -60,7 +60,7 @@ _CCCL_DIAG_SUPPRESS_MSVC(4848) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __uses_nonqualification_pointer_conversion = is_pointer_v<_From> && is_pointer_v<_To> @@ -106,7 +106,7 @@ template concept __subrange_to_pair = __different_from<_Pair, subrange<_Iter, _Sent, _Kind>> && __pair_like_convertible_from<_Pair, const _Iter&, const _Sent&>; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( @@ -211,19 +211,19 @@ _CCCL_CONCEPT_FRAGMENT(__subrange_to_pair_, template _CCCL_CONCEPT __subrange_to_pair = _CCCL_FRAGMENT(__subrange_to_pair_, _Iter, _Sent, integral_constant, _Pair); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template _Sent, subrange_kind _Kind> requires(_Kind == subrange_kind::sized || !sized_sentinel_for<_Sent, _Iter>) -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template , int>, enable_if_t, int>, enable_if_t<(_Kind == subrange_kind::sized || !sized_sentinel_for<_Sent, _Iter>), int>> -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS class _CCCL_TYPE_VISIBILITY_DEFAULT subrange : public view_interface> { public: @@ -243,15 +243,15 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT subrange : public view_interface= 2020 +# if !defined(_CCCL_NO_CONCEPTS) subrange() requires default_initializable<_Iter> = default; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template , int> = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr subrange() noexcept(is_nothrow_default_constructible_v<_It>) : view_interface>(){}; -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS _CCCL_TEMPLATE(class _It) _CCCL_REQUIRES(__subrange_from_iter_sent<_Iter, _It, _StoreSize>) diff --git a/libcudacxx/include/cuda/std/__ranges/view_interface.h b/libcudacxx/include/cuda/std/__ranges/view_interface.h index 661e20c1b68..a5055867542 100644 --- a/libcudacxx/include/cuda/std/__ranges/view_interface.h +++ b/libcudacxx/include/cuda/std/__ranges/view_interface.h @@ -39,25 +39,25 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __can_empty = requires(_Tp& __t) { _CUDA_VRANGES::empty(__t); }; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__can_empty_, requires(_Tp& __t)(typename(decltype(_CUDA_VRANGES::empty(__t))))); template _CCCL_CONCEPT __can_empty = 
_CCCL_FRAGMENT(__can_empty_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template requires is_class_v<_Derived> && same_as<_Derived, remove_cv_t<_Derived>> -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template && same_as<_Derived, remove_cv_t<_Derived>>, int>> -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS class view_interface { _LIBCUDACXX_HIDE_FROM_ABI constexpr _Derived& __derived() noexcept From 7d35d56657e65137497ceff6c06858f56ec6fda5 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Mon, 25 Nov 2024 21:05:37 -0800 Subject: [PATCH 26/45] remove definition of macro that is no longer used (#2957) --- libcudacxx/test/utils/libcudacxx/test/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py index 0ae9f226d65..4bf1f48739b 100644 --- a/libcudacxx/test/utils/libcudacxx/test/config.py +++ b/libcudacxx/test/utils/libcudacxx/test/config.py @@ -995,7 +995,6 @@ def configure_compile_flags_rtti(self): self.cxx.compile_flags += ['-D_SILENCE_CXX20_CISO646_REMOVED_WARNING'] else: self.cxx.compile_flags += ['-fno-rtti'] - self.cxx.compile_flags += ['-D_LIBCUDACXX_NO_RTTI'] def configure_compile_flags_abi_version(self): abi_version = self.get_lit_conf('abi_version', '').strip() From 658e0bbfa7a8d309c1f234ae77f1352c8f8849ac Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 09:00:35 +0100 Subject: [PATCH 27/45] Avoid symbol clashes with libc++ (#2955) * Drop `__libcpp` prefix in favor of `__cccl` libc++ has moved towards just using unqualified `std::` for any partial qualification. That leads to a high chance of symbol clashes if we use the same names as them. 
As a first step, replace all uses of `__libcpp` with `__cccl`. I was wondering about `__libcupp` but :shrug:
* Backport `is_constant_evaluated`
---
 .../cuda/experimental/__async/stop_token.cuh | 2 +-
 .../cuda/__barrier/barrier_block_scope.h | 16 +-
 libcudacxx/include/cuda/pipeline | 2 +-
 .../include/cuda/std/__algorithm/copy.h | 4 +-
 .../cuda/std/__atomic/wait/notify_wait.h | 4 +-
 .../include/cuda/std/__atomic/wait/polling.h | 2 +-
 .../include/cuda/std/__barrier/barrier.h | 4 +-
 libcudacxx/include/cuda/std/__bit/clz.h | 16 +-
 libcudacxx/include/cuda/std/__bit/countl.h | 13 +-
 libcudacxx/include/cuda/std/__bit/countr.h | 13 +-
 libcudacxx/include/cuda/std/__bit/ctz.h | 16 +-
 .../include/cuda/std/__bit/has_single_bit.h | 4 +-
 libcudacxx/include/cuda/std/__bit/integral.h | 10 +-
 libcudacxx/include/cuda/std/__bit/popc.h | 16 +-
 libcudacxx/include/cuda/std/__bit/popcount.h | 12 +-
 libcudacxx/include/cuda/std/__bit/reference.h | 6 +-
 libcudacxx/include/cuda/std/__bit/rotate.h | 8 +-
 .../include/cuda/std/__complex/nvbf16.h | 2 +-
 .../include/cuda/std/__complex/nvfp16.h | 2 +-
 .../include/cuda/std/__concepts/arithmetic.h | 2 +-
 .../cuda/std/__iterator/erase_if_container.h | 2 +-
 .../include/cuda/std/__memory/allocator.h | 16 +-
 .../cuda/std/__memory/builtin_new_allocator.h | 6 +-
 .../include/cuda/std/__memory/construct_at.h | 8 +-
 .../cuda/std/__memory/temporary_buffer.h | 2 +-
 .../std/__memory/uninitialized_algorithms.h | 6 +-
 libcudacxx/include/cuda/std/__new/allocate.h | 22 +-
 .../cuda/std/__semaphore/atomic_semaphore.h | 4 +-
 .../include/cuda/std/__string/string_view.h | 2 +-
 .../cuda/std/__thread/threading_support.h | 12 +-
 .../std/__thread/threading_support_cuda.h | 4 +-
 .../std/__thread/threading_support_external.h | 4 +-
 .../std/__thread/threading_support_pthread.h | 52 ++--
 .../std/__thread/threading_support_win32.h | 22 +-
 .../std/__type_traits/add_lvalue_reference.h | 2 +-
 .../cuda/std/__type_traits/add_pointer.h | 2 +-
 .../std/__type_traits/add_rvalue_reference.h | 2 +-
 .../include/cuda/std/__type_traits/decay.h | 2 +-
 .../cuda/std/__type_traits/is_bounded_array.h | 4 +-
 .../std/__type_traits/is_constant_evaluated.h | 11 +-
 .../cuda/std/__type_traits/is_constructible.h | 17 +-
 .../include/cuda/std/__type_traits/is_empty.h | 6 +-
 .../std/__type_traits/is_floating_point.h | 10 +-
 .../cuda/std/__type_traits/is_integral.h | 40 +--
 .../is_member_function_pointer.h | 6 +-
 .../__type_traits/is_member_object_pointer.h | 2 +-
 .../std/__type_traits/is_member_pointer.h | 2 +-
 .../std/__type_traits/is_nothrow_assignable.h | 8 +-
 .../__type_traits/is_nothrow_constructible.h | 12 +-
 .../__type_traits/is_nothrow_destructible.h | 10 +-
 .../cuda/std/__type_traits/is_pointer.h | 6 +-
 .../cuda/std/__type_traits/is_referenceable.h | 8 +-
 .../cuda/std/__type_traits/is_signed.h | 10 +-
 .../std/__type_traits/is_signed_integer.h | 14 +-
 .../cuda/std/__type_traits/is_swappable.h | 4 +-
 .../__type_traits/is_trivially_destructible.h | 4 +-
 .../std/__type_traits/is_unbounded_array.h | 4 +-
 .../include/cuda/std/__type_traits/is_union.h | 4 +-
 .../cuda/std/__type_traits/is_unsigned.h | 10 +-
 .../std/__type_traits/is_unsigned_integer.h | 14 +-
 .../include/cuda/std/__utility/unreachable.h | 2 +-
 .../cuda/std/detail/libcxx/include/__string | 4 +-
 .../cuda/std/detail/libcxx/include/algorithm | 2 +-
 .../cuda/std/detail/libcxx/include/complex | 32 +--
 .../cuda/std/detail/libcxx/include/limits | 22 +-
 libcudacxx/test/NOTES.TXT | 2 +-
 .../is_constant_evaluated.pass.cpp | 10 +-
 .../utilities/meta/is_referenceable.pass.cpp | 258
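As context for the "Backport `is_constant_evaluated`" item above: the hunks in this patch switch call sites from the old `__libcpp_is_constant_evaluated()` helper to `_CUDA_VSTD::is_constant_evaluated()`. Below is a minimal sketch of how such a pre-C++20 backport can sit on top of the compiler intrinsic; the function name, header guard, and feature probe are illustrative assumptions, not the contents of the real `is_constant_evaluated.h`.

```cpp
// Illustrative sketch only (not the actual libcu++ header): provide an
// is_constant_evaluated() usable before C++20 by forwarding to the compiler
// builtin when it exists, and conservatively reporting "runtime" otherwise.
#ifndef MY_IS_CONSTANT_EVALUATED_H // hypothetical header guard
#define MY_IS_CONSTANT_EVALUATED_H

constexpr bool my_is_constant_evaluated() noexcept
{
#if defined(__has_builtin)
#  if __has_builtin(__builtin_is_constant_evaluated)
  return __builtin_is_constant_evaluated(); // GCC 9+, Clang 9+, recent MSVC
#  else
  return false; // no builtin: callers take the runtime code path
#  endif
#else
  return false; // preprocessor cannot probe builtins: assume runtime
#endif
}

#endif // MY_IS_CONSTANT_EVALUATED_H
```

Call sites such as `__dispatch_memmove` in `__algorithm/copy.h` (further down in this patch) use the result to bail out of the `memmove` fast path during constant evaluation, where `memmove` and raw pointer comparisons are not usable.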
+++++++++--------- .../partial_sort_copy.pass.cpp | 4 +- .../partial_sort_copy_comp.pass.cpp | 4 +- .../sequences/inplace_vector/access.pass.cpp | 2 +- .../sequences/inplace_vector/assign.pass.cpp | 2 +- .../inplace_vector/assignment.pass.cpp | 2 +- .../inplace_vector/capacity.pass.cpp | 2 +- .../inplace_vector/comparison.pass.cpp | 2 +- .../inplace_vector/constructor.pass.cpp | 4 +- .../sequences/inplace_vector/emplace.pass.cpp | 2 +- .../sequences/inplace_vector/insert.pass.cpp | 2 +- .../inplace_vector/iterators.pass.cpp | 2 +- .../sequences/inplace_vector/resize.pass.cpp | 2 +- .../sequences/inplace_vector/swap.pass.cpp | 2 +- .../unique.ptr.ctor/pointer_deleter.pass.cpp | 4 +- .../is_constant_evaluated.fail.cpp | 28 -- .../meta.unary.prop/is_constructible.pass.cpp | 18 +- libcudacxx/test/support/check_assertion.h | 2 +- libcudacxx/test/support/test_macros.h | 2 +- 86 files changed, 455 insertions(+), 493 deletions(-) delete mode 100644 libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp diff --git a/cudax/include/cuda/experimental/__async/stop_token.cuh b/cudax/include/cuda/experimental/__async/stop_token.cuh index 52ff380ad99..32aeb3ea63a 100644 --- a/cudax/include/cuda/experimental/__async/stop_token.cuh +++ b/cudax/include/cuda/experimental/__async/stop_token.cuh @@ -96,7 +96,7 @@ struct __spin_wait else { --__count_; - _CUDA_VSTD::__libcpp_thread_yield_processor(); + _CUDA_VSTD::__cccl_thread_yield_processor(); } } diff --git a/libcudacxx/include/cuda/__barrier/barrier_block_scope.h b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h index 163aad61da1..e79165ae8d0 100644 --- a/libcudacxx/include/cuda/__barrier/barrier_block_scope.h +++ b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h @@ -229,7 +229,7 @@ class barrier : public __blo ( int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); } else if (!__isShared(&__barrier)) { __trap(); } @@ -256,7 +256,7 @@ class barrier : public __blo ( bool __ready = 0; if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); } @@ -267,7 +267,7 @@ class barrier : public __blo } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); return __ready;), NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + (return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), _CUDA_VSTD::chrono::nanoseconds(__nanosec));)) } @@ -331,7 +331,7 @@ class barrier : public __blo ( int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); } else if (!__isShared(&__barrier)) { __trap(); } @@ -359,7 +359,7 @@ class barrier : public __blo ( bool __ready = 0; if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); } @@ -371,20 +371,20 @@ class barrier : public __blo return __ready;), 
NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + (return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec);)) } public: _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__phase))); } _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __phase_parity) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity)); } diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 564075e1827..d034c931644 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -296,7 +296,7 @@ public: _LIBCUDACXX_HIDE_FROM_ABI bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __duration) { barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__tail)->__produced; - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity>(&__stage_barrier, __produced_phase_parity), _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration)); } diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index 2333a19a604..f4013d4ea73 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -54,7 +54,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool __dispatch_memmove(_Up* __r return false; #endif - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return false; } @@ -114,7 +114,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 pair<_Tp*, _Up*> __copy(_Tp* __f { return {__last, __result + __n}; } - if ((!__libcpp_is_constant_evaluated() && __first < __result) + if ((!_CUDA_VSTD::is_constant_evaluated() && __first < __result) || __constexpr_tail_overlap(__first, __result, __last)) { for (ptrdiff_t __i = __n; __i > 0; --__i) diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h index b79b22adad6..1c4c23d959e 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -72,11 +72,11 @@ _LIBCUDACXX_HIDE_FROM_ABI void __atomic_wait( } if (__i < 12) { - _CUDA_VSTD::__libcpp_thread_yield_processor(); + _CUDA_VSTD::__cccl_thread_yield_processor(); } else { - _CUDA_VSTD::__libcpp_thread_yield(); + _CUDA_VSTD::__cccl_thread_yield(); } } while (__nonatomic_compare_equal(__atomic_load_dispatch(__a, __order, _Sco{}), __val)) diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h index cbb1a73a4b8..54ba4a08948 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/polling.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -53,7 +53,7 @@ template _CCCL_HOST_DEVICE void __atomic_try_wait_slow_fallback( _Tp const volatile* __a, __atomic_underlying_remove_cv_t<_Tp> __val, memory_order __order, _Sco) { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); + _CUDA_VSTD::__cccl_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); } 
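Several of the helpers renamed in the hunks above (`__cccl_thread_poll_with_backoff`, `__cccl_thread_yield_processor`, `__cccl_thread_yield`, `__cccl_thread_sleep_for`) cooperate to implement a spin-then-back-off wait, which the `threading_support.h` hunk later in this patch spells out. The following is a simplified, host-only sketch of that idea using plain standard-library facilities; it is illustrative rather than the libcu++ implementation, and the polling count and thresholds are made-up stand-ins for `_LIBCUDACXX_POLLING_COUNT` and the real tuning constants.

```cpp
// Simplified host-only sketch of a poll-with-backoff loop (illustrative only).
// Spin on the predicate first, then progressively back off by yielding and
// finally sleeping, with an optional timeout.
#include <chrono>
#include <thread>

template <class Predicate>
bool poll_with_backoff(Predicate ready,
                       std::chrono::nanoseconds max_wait = std::chrono::nanoseconds::zero())
{
  constexpr int polling_count = 64; // stand-in for _LIBCUDACXX_POLLING_COUNT
  auto const start            = std::chrono::high_resolution_clock::now();
  for (int count = 0;; ++count)
  {
    if (ready())
    {
      return true; // condition satisfied
    }
    if (count < polling_count)
    {
      continue; // hot spin phase
    }
    auto const elapsed = std::chrono::high_resolution_clock::now() - start;
    if (max_wait != std::chrono::nanoseconds::zero() && elapsed >= max_wait)
    {
      return false; // timed out
    }
    // Back off harder the longer we have already waited.
    auto const step = elapsed / 4;
    if (step >= std::chrono::milliseconds(1))
    {
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    else if (step >= std::chrono::microseconds(10))
    {
      std::this_thread::sleep_for(step);
    }
    else
    {
      std::this_thread::yield();
    }
  }
}
```

A caller typically passes a lambda that polls an atomic, e.g. `poll_with_backoff([&] { return flag.load(std::memory_order_acquire); }, std::chrono::milliseconds(5));`, which mirrors how `__atomic_try_wait_slow_fallback` and the semaphore's `__acquire_slow_timed` use the real helper in the hunks of this patch.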
_LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__barrier/barrier.h b/libcudacxx/include/cuda/std/__barrier/barrier.h index 491998132a8..e17d4a2d111 100644 --- a/libcudacxx/include/cuda/std/__barrier/barrier.h +++ b/libcudacxx/include/cuda/std/__barrier/barrier.h @@ -192,12 +192,12 @@ class __barrier_base<__empty_completion, _Sco> } _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__cccl_thread_poll_with_backoff( __barrier_poll_tester_phase<__barrier_base>(this, _CUDA_VSTD::move(__phase))); } _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __parity) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); + _CUDA_VSTD::__cccl_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); } _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() { diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h index 267f022737a..791db82ca7f 100644 --- a/libcudacxx/include/cuda/std/__bit/clz.h +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -75,10 +75,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_clz(uint64_t __x) noexcept # endif } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint32_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);)) } @@ -86,10 +86,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) noexcept return __constexpr_clz(__x); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint64_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);)) } @@ -100,10 +100,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) noexcept #else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint32_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; if (_BitScanReverse(&__where, __x)) @@ -117,10 +117,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) return __binary_clz32(static_cast(__x), 0); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint64_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; # if defined(_LIBCUDACXX_HAS_BITSCAN64) diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h index f15e14a5293..3642d17de09 100644 --- a/libcudacxx/include/cuda/std/__bit/countl.h +++ b/libcudacxx/include/cuda/std/__bit/countl.h @@ -38,14 +38,14 @@ template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countl_zero_dispatch(_Tp __t) noexcept { - return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); + return 
__cccl_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); } template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countl_zero_dispatch(_Tp __t) noexcept { - return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); + return __cccl_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); } template @@ -90,27 +90,26 @@ __countl_zero_dispatch(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits; } template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_one(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); return __t != numeric_limits<_Tp>::max() ? __countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countl_zero(_Tp __t) noexcept { return __countl_zero(__t); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countl_one(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countl_one(_Tp __t) noexcept { return __countl_one(__t); } diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h index 21e65f800ba..e7a2b609abe 100644 --- a/libcudacxx/include/cuda/std/__bit/countr.h +++ b/libcudacxx/include/cuda/std/__bit/countr.h @@ -37,14 +37,14 @@ template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countr_zero_dispatch(_Tp __t) noexcept { - return __libcpp_ctz(static_cast(__t)); + return __cccl_ctz(static_cast(__t)); } template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countr_zero_dispatch(_Tp __t) noexcept { - return __libcpp_ctz(static_cast(__t)); + return __cccl_ctz(static_cast(__t)); } template @@ -83,7 +83,7 @@ __countr_zero_dispatch(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_zero(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); return __t ? __countr_zero_dispatch(__t) : numeric_limits<_Tp>::digits; } @@ -91,20 +91,19 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_zero(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_one(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); return __t != numeric_limits<_Tp>::max() ? 
__countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countr_zero(_Tp __t) noexcept { return __countr_zero(__t); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countr_one(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countr_one(_Tp __t) noexcept { return __countr_one(__t); } diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h index 9d2e771bd61..813afa6be65 100644 --- a/libcudacxx/include/cuda/std/__bit/ctz.h +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -75,10 +75,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_ctz(uint64_t __x) noexcept # endif } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint32_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET( NV_IS_DEVICE, (return (!__x) ? (sizeof(uint32_t) * 8) : (__ffs(__x) - 1);), (return __builtin_ctz(__x);)) @@ -87,10 +87,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) noexcept return __constexpr_ctz(__x); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint64_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET( NV_IS_DEVICE, (return (!__x) ? 
(sizeof(uint64_t) * 8) : (__ffsll(__x) - 1);), (return __builtin_ctzll(__x);)) @@ -102,10 +102,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) noexcept #else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint32_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; if (_BitScanForward(&__where, __x)) @@ -119,10 +119,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) return __binary_ctz32(static_cast(__x), 0); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint64_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; # if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__)) diff --git a/libcudacxx/include/cuda/std/__bit/has_single_bit.h b/libcudacxx/include/cuda/std/__bit/has_single_bit.h index 1cd207f72bb..07586899549 100644 --- a/libcudacxx/include/cuda/std/__bit/has_single_bit.h +++ b/libcudacxx/include/cuda/std/__bit/has_single_bit.h @@ -29,12 +29,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __has_single_bit(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); return __t != 0 && (((__t & (__t - 1)) == 0)); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool> +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, bool> has_single_bit(_Tp __t) noexcept { return __has_single_bit(__t); diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 869972f3422..f0186ad9f5f 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); } @@ -51,21 +51,19 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_floor(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_floor(_Tp __t) noexcept { return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept { return (__t < 2) ? 
1 : static_cast<_Tp>(__ceil2(__t)); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -bit_width(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> bit_width(_Tp __t) noexcept { return __t == 0 ? 0 : static_cast(__bit_log2(__t) + 1); } diff --git a/libcudacxx/include/cuda/std/__bit/popc.h b/libcudacxx/include/cuda/std/__bit/popc.h index dc22999b985..6a1cb93239f 100644 --- a/libcudacxx/include/cuda/std/__bit/popc.h +++ b/libcudacxx/include/cuda/std/__bit/popc.h @@ -76,10 +76,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_popcount(uint64_t __x) noexc # endif } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint32_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popc(__x);), (return __builtin_popcount(__x);)) } @@ -87,10 +87,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) noexcept return __constexpr_popcount(static_cast(__x)); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint64_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountll(__x);)) } @@ -100,9 +100,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) noexcept #else // _CCCL_COMPILER(MSVC) -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint32_t __x) { - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_TARGET(NV_IS_HOST, (return static_cast(_LIBCUDACXX_MSVC_POPC(__x));)) } @@ -110,9 +110,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) return __fallback_popc64(static_cast(__x)); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint64_t __x) { - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_TARGET(NV_IS_HOST, (return static_cast(_LIBCUDACXX_MSVC_POPC64(__x));)) } diff --git a/libcudacxx/include/cuda/std/__bit/popcount.h b/libcudacxx/include/cuda/std/__bit/popcount.h index 5d4395cb457..18c8d97dd30 100644 --- a/libcudacxx/include/cuda/std/__bit/popcount.h +++ b/libcudacxx/include/cuda/std/__bit/popcount.h @@ -33,14 +33,14 @@ template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __popcount_dispatch(_Tp __t) noexcept { - return __libcpp_popc(static_cast(__t)); + return __cccl_popc(static_cast(__t)); } template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __popcount_dispatch(_Tp __t) noexcept { - return __libcpp_popc(static_cast(__t)); + return __cccl_popc(static_cast(__t)); } template @@ -49,7 +49,7 @@ struct __popcount_rsh_impl static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __count(_Tp __t) { return __popcount_rsh_impl<_Tp, _St - 1>::__count(__t >> numeric_limits::digits) - + __libcpp_popc(static_cast(__t)); + + __cccl_popc(static_cast(__t)); } }; @@ -58,7 +58,7 @@ struct __popcount_rsh_impl<_Tp, 1> { static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __count(_Tp __t) { - return __libcpp_popc(static_cast(__t)); + 
return __cccl_popc(static_cast(__t)); } }; @@ -72,13 +72,13 @@ __popcount_dispatch(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __popcount(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__cccl_popcount requires unsigned"); return __popcount_dispatch(__t); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> popcount(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> popcount(_Tp __t) noexcept { return __popcount(__t); } diff --git a/libcudacxx/include/cuda/std/__bit/reference.h b/libcudacxx/include/cuda/std/__bit/reference.h index d4c7320a701..12acac014b1 100644 --- a/libcudacxx/include/cuda/std/__bit/reference.h +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -109,7 +109,7 @@ class __bit_reference } _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> operator&() const noexcept { - return __bit_iterator<_Cp, false>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); + return __bit_iterator<_Cp, false>(__seg_, static_cast(_CUDA_VSTD::__cccl_ctz(__mask_))); } friend _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void @@ -180,7 +180,7 @@ class __bit_const_reference _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, true> operator&() const noexcept { - return __bit_iterator<_Cp, true>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); + return __bit_iterator<_Cp, true>(__seg_, static_cast(_CUDA_VSTD::__cccl_ctz(__mask_))); } private: @@ -812,7 +812,7 @@ struct __bit_array _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 explicit __bit_array(difference_type __s) : __size_(__s) { - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { for (size_t __i = 0; __i != __bit_array<_Cp>::_Np; ++__i) { diff --git a/libcudacxx/include/cuda/std/__bit/rotate.h b/libcudacxx/include/cuda/std/__bit/rotate.h index 0d5d7652a91..bf2c2e5f61a 100644 --- a/libcudacxx/include/cuda/std/__bit/rotate.h +++ b/libcudacxx/include/cuda/std/__bit/rotate.h @@ -30,7 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotl(_Tp __t, unsigned int __cnt) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); using __nlt = numeric_limits<_Tp>; return ((__cnt % __nlt::digits) == 0) @@ -41,7 +41,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotl(_Tp __t, unsigned template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotr(_Tp __t, unsigned int __cnt) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); using __nlt = numeric_limits<_Tp>; return ((__cnt % __nlt::digits) == 0) @@ -50,7 +50,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotr(_Tp __t, unsigned } template -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> rotl(_Tp __t, unsigned int __cnt) noexcept { return __rotl(__t, __cnt); @@ -58,7 +58,7 @@ rotl(_Tp __t, unsigned int __cnt) noexcept // rotr template 
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> rotr(_Tp __t, unsigned int __cnt) noexcept { return __rotr(__t, __cnt); diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index ede7f05a29a..0167f952141 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -83,7 +83,7 @@ struct __type_to_vector<__nv_bfloat16> }; template <> -struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false> +struct __cccl_complex_overload_traits<__nv_bfloat16, false, false> { typedef __nv_bfloat16 _ValueType; typedef complex<__nv_bfloat16> _ComplexType; diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index 11406f98588..8ddd2b27747 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -80,7 +80,7 @@ struct __type_to_vector<__half> }; template <> -struct __libcpp_complex_overload_traits<__half, false, false> +struct __cccl_complex_overload_traits<__half, false, false> { typedef __half _ValueType; typedef complex<__half> _ComplexType; diff --git a/libcudacxx/include/cuda/std/__concepts/arithmetic.h b/libcudacxx/include/cuda/std/__concepts/arithmetic.h index 5a643652824..cd909548745 100644 --- a/libcudacxx/include/cuda/std/__concepts/arithmetic.h +++ b/libcudacxx/include/cuda/std/__concepts/arithmetic.h @@ -47,7 +47,7 @@ template _CCCL_CONCEPT floating_point = _CCCL_TRAIT(is_floating_point, _Tp); template -_CCCL_CONCEPT __libcpp_signed_integer = __libcpp_is_signed_integer<_Tp>::value; +_CCCL_CONCEPT __cccl_signed_integer = __cccl_is_signed_integer<_Tp>::value; #endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES diff --git a/libcudacxx/include/cuda/std/__iterator/erase_if_container.h b/libcudacxx/include/cuda/std/__iterator/erase_if_container.h index 2d2b6e35767..e4573dc187b 100644 --- a/libcudacxx/include/cuda/std/__iterator/erase_if_container.h +++ b/libcudacxx/include/cuda/std/__iterator/erase_if_container.h @@ -24,7 +24,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_LIBCUDACXX_HIDE_FROM_ABI typename _Container::size_type __libcpp_erase_if_container(_Container& __c, _Predicate& __pred) +_LIBCUDACXX_HIDE_FROM_ABI typename _Container::size_type __cccl_erase_if_container(_Container& __c, _Predicate& __pred) { typename _Container::size_type __old_size = __c.size(); diff --git a/libcudacxx/include/cuda/std/__memory/allocator.h b/libcudacxx/include/cuda/std/__memory/allocator.h index fecac15b13f..c771226e191 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator.h +++ b/libcudacxx/include/cuda/std/__memory/allocator.h @@ -129,13 +129,13 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator : private __non_trivial_if{}.allocate(__n); } #endif // _CCCL_HAS_CONSTEXPR_ALLOCATION { - return static_cast<_Tp*>(_CUDA_VSTD::__libcpp_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); + return static_cast<_Tp*>(_CUDA_VSTD::__cccl_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); } } @@ -150,14 +150,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator : private __non_trivial_if{}.deallocate(__p, __n); } else #endif // _CCCL_STD_VER >= 2020 { - _CUDA_VSTD::__libcpp_deallocate((void*) __p, __n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp)); + _CUDA_VSTD::__cccl_deallocate((void*) __p, __n * sizeof(_Tp), 
_LIBCUDACXX_ALIGNOF(_Tp)); } } @@ -231,13 +231,13 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator { __throw_bad_array_new_length(); } - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return static_cast(::operator new(__n * sizeof(_Tp))); } else { - return static_cast(_CUDA_VSTD::__libcpp_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); + return static_cast(_CUDA_VSTD::__cccl_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); } } @@ -250,13 +250,13 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void deallocate(const _Tp* __p, size_t __n) noexcept { - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { ::operator delete(const_cast<_Tp*>(__p)); } else { - _CUDA_VSTD::__libcpp_deallocate((void*) const_cast<_Tp*>(__p), __n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp)); + _CUDA_VSTD::__cccl_deallocate((void*) const_cast<_Tp*>(__p), __n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp)); } } diff --git a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h index 03a45cac5a6..5752a48ec04 100644 --- a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h +++ b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h @@ -45,7 +45,7 @@ struct __builtin_new_allocator _LIBCUDACXX_HIDE_FROM_ABI void operator()(void* __p) const noexcept { - _CUDA_VSTD::__libcpp_deallocate(__p, __size_, __align_); + _CUDA_VSTD::__cccl_deallocate(__p, __size_, __align_); } private: @@ -57,12 +57,12 @@ struct __builtin_new_allocator _LIBCUDACXX_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { - return __holder_t(_CUDA_VSTD::__libcpp_allocate(__s, __align), __builtin_new_deleter(__s, __align)); + return __holder_t(_CUDA_VSTD::__cccl_allocate(__s, __align), __builtin_new_deleter(__s, __align)); } _LIBCUDACXX_HIDE_FROM_ABI static void __deallocate_bytes(void* __p, size_t __s, size_t __align) noexcept { - _CUDA_VSTD::__libcpp_deallocate(__p, __s, __align); + _CUDA_VSTD::__cccl_deallocate(__p, __s, __align); } template diff --git a/libcudacxx/include/cuda/std/__memory/construct_at.h b/libcudacxx/include/cuda/std/__memory/construct_at.h index 18300552e7e..aeb39a6bf18 100644 --- a/libcudacxx/include/cuda/std/__memory/construct_at.h +++ b/libcudacxx/include/cuda/std/__memory/construct_at.h @@ -114,7 +114,7 @@ construct_at(_Tp* __location, _Args&&... __args) { _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } @@ -131,7 +131,7 @@ construct_at(_Tp* __location, _Args&&... __args) { _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } @@ -150,7 +150,7 @@ __construct_at(_Tp* __location, _Args&&... 
__args) _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); #if _CCCL_STD_VER >= 2020 // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } @@ -167,7 +167,7 @@ __construct_at(_Tp* __location, _Args&&... __args) _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); #if _CCCL_STD_VER >= 2020 // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } diff --git a/libcudacxx/include/cuda/std/__memory/temporary_buffer.h b/libcudacxx/include/cuda/std/__memory/temporary_buffer.h index 37f64befac4..2aa33cad869 100644 --- a/libcudacxx/include/cuda/std/__memory/temporary_buffer.h +++ b/libcudacxx/include/cuda/std/__memory/temporary_buffer.h @@ -80,7 +80,7 @@ get_temporary_buffer(ptrdiff_t __n) noexcept template _LIBCUDACXX_HIDE_FROM_ABI void return_temporary_buffer(_Tp* __p) noexcept { - _CUDA_VSTD::__libcpp_deallocate_unsized((void*) __p, _LIBCUDACXX_ALIGNOF(_Tp)); + _CUDA_VSTD::__cccl_deallocate_unsized((void*) __p, _LIBCUDACXX_ALIGNOF(_Tp)); } _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h b/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h index a194efa5a02..11b476ba76c 100644 --- a/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h +++ b/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h @@ -349,7 +349,7 @@ __allocator_destroy_multidimensional(_Alloc& __alloc, _BidirIter __first, _Bidir _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_array, _ValueType)) { - static_assert(!__libcpp_is_unbounded_array<_ValueType>::value, + static_assert(!__cccl_is_unbounded_array<_ValueType>::value, "arrays of unbounded arrays don't exist, but if they did we would mess up here"); using _Element = remove_extent_t<_ValueType>; @@ -576,7 +576,7 @@ template < _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 _Out* __uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* __first2) { - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { while (__first1 != __last1) { @@ -650,7 +650,7 @@ template -_LIBCUDACXX_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) +_LIBCUDACXX_HIDE_FROM_ABI void* __cccl_operator_new(_Args... __args) { // Those builtins are not usable on device and the tests crash when using them #if defined(_CCCL_BUILTIN_OPERATOR_NEW) @@ -68,7 +68,7 @@ _LIBCUDACXX_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) } template -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_operator_delete(_Args... __args) { // Those builtins are not usable on device and the tests crash when using them #if defined(_CCCL_BUILTIN_OPERATOR_DELETE) @@ -78,17 +78,17 @@ _LIBCUDACXX_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... 
__args) #endif // !_CCCL_BUILTIN_OPERATOR_DELETE } -_LIBCUDACXX_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __align) +_LIBCUDACXX_HIDE_FROM_ABI void* __cccl_allocate(size_t __size, size_t __align) { #ifndef _LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const ::std::align_val_t __align_val = static_cast<::std::align_val_t>(__align); - return __libcpp_operator_new(__size, __align_val); + return __cccl_operator_new(__size, __align_val); } #endif // !_LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION (void) __align; - return __libcpp_operator_new(__size); + return __cccl_operator_new(__size); } template @@ -96,13 +96,13 @@ _LIBCUDACXX_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t _ { #ifdef _LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION (void) __size; - return _CUDA_VSTD::__libcpp_operator_delete(__ptr, __args...); + return _CUDA_VSTD::__cccl_operator_delete(__ptr, __args...); #else // ^^^ _LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION ^^^ / vvv !_LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION vvv - return _CUDA_VSTD::__libcpp_operator_delete(__ptr, __size, __args...); + return _CUDA_VSTD::__cccl_operator_delete(__ptr, __size, __args...); #endif // !_LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_deallocate(void* __ptr, size_t __size, size_t __align) { #ifndef _LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) @@ -115,17 +115,17 @@ _LIBCUDACXX_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, s return __do_deallocate_handle_size(__ptr, __size); } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_deallocate_unsized(void* __ptr, size_t __align) { #ifndef _LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const ::std::align_val_t __align_val = static_cast<::std::align_val_t>(__align); - return __libcpp_operator_delete(__ptr, __align_val); + return __cccl_operator_delete(__ptr, __align_val); } #endif // !_LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION (void) __align; - return __libcpp_operator_delete(__ptr); + return __cccl_operator_delete(__ptr); } _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h b/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h index 78013392630..fb6d302e771 100644 --- a/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h +++ b/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h @@ -74,7 +74,7 @@ class __atomic_semaphore _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __acquire_slow_timed(chrono::nanoseconds const& __rel_time) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( [this]() { ptrdiff_t const __old = __count.load(memory_order_acquire); return __old != 0 && __fetch_sub_if_slow(__old); @@ -157,7 +157,7 @@ class __atomic_semaphore<_Sco, 1> _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __acquire_slow_timed(chrono::nanoseconds const& __rel_time) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( [this]() { return try_acquire(); }, diff --git a/libcudacxx/include/cuda/std/__string/string_view.h b/libcudacxx/include/cuda/std/__string/string_view.h index 458c46e3063..46bf51b589c 100644 --- a/libcudacxx/include/cuda/std/__string/string_view.h +++ 
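// Aside: a hedged sketch of the dispatch performed by the renamed __cccl_allocate /
// __cccl_deallocate and __do_deallocate_handle_size above: over-aligned requests go
// to the align_val_t overloads of operator new/delete, and sized deallocation is
// used when the size is known. The names raw_allocate/raw_deallocate below are
// illustrative, not library API.
#include <cstddef>
#include <new>

inline bool overaligned_for_new(std::size_t align) noexcept
{
  return align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
}

inline void* raw_allocate(std::size_t size, std::size_t align)
{
  if (overaligned_for_new(align))
  {
    return ::operator new(size, std::align_val_t{align});
  }
  return ::operator new(size);
}

inline void raw_deallocate(void* p, std::size_t size, std::size_t align) noexcept
{
  if (overaligned_for_new(align))
  {
    ::operator delete(p, size, std::align_val_t{align}); // sized, aligned
    return;
  }
  // Sized delete; an unsized fallback exists for toolchains without sized deallocation.
  ::operator delete(p, size);
}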
b/libcudacxx/include/cuda/std/__string/string_view.h @@ -229,7 +229,7 @@ struct __string_view { // If we're in a constant evaluated context, we cannot compare the __str_ // members for equality. - return __compare(__other, bool_constant<__libcpp_default_is_constant_evaluated()>()); + return __compare(__other, bool_constant<__cccl_default_is_constant_evaluated()>()); } _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool diff --git a/libcudacxx/include/cuda/std/__thread/threading_support.h b/libcudacxx/include/cuda/std/__thread/threading_support.h index b131dbf0f94..d2ebacf576f 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support.h @@ -52,13 +52,13 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD # define __LIBCUDACXX_ASM_THREAD_YIELD (;) # endif // !__x86_64__ -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield_processor() +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield_processor() { NV_IF_TARGET(NV_IS_HOST, __LIBCUDACXX_ASM_THREAD_YIELD) } template -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_thread_poll_with_backoff( +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_thread_poll_with_backoff( _Fn&& __f, _CUDA_VSTD::chrono::nanoseconds __max = _CUDA_VSTD::chrono::nanoseconds::zero()) { _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = @@ -73,7 +73,7 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_thread_poll_with_backoff( { if (__count > (_LIBCUDACXX_POLLING_COUNT >> 1)) { - _CUDA_VSTD::__libcpp_thread_yield_processor(); + _CUDA_VSTD::__cccl_thread_yield_processor(); } __count += 1; continue; @@ -87,15 +87,15 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_thread_poll_with_backoff( _CUDA_VSTD::chrono::nanoseconds const __step = __elapsed / 4; if (__step >= _CUDA_VSTD::chrono::milliseconds(1)) { - _CUDA_VSTD::__libcpp_thread_sleep_for(_CUDA_VSTD::chrono::milliseconds(1)); + _CUDA_VSTD::__cccl_thread_sleep_for(_CUDA_VSTD::chrono::milliseconds(1)); } else if (__step >= _CUDA_VSTD::chrono::microseconds(10)) { - _CUDA_VSTD::__libcpp_thread_sleep_for(__step); + _CUDA_VSTD::__cccl_thread_sleep_for(__step); } else { - _CUDA_VSTD::__libcpp_thread_yield(); + _CUDA_VSTD::__cccl_thread_yield(); } } } diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h b/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h index c361b0f7e06..c46cf508dca 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h @@ -29,9 +29,9 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield() {} +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield() {} -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) { NV_IF_TARGET(NV_IS_DEVICE, (auto const __step = __ns.count(); assert(__step < numeric_limits::max()); diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_external.h b/libcudacxx/include/cuda/std/__thread/threading_support_external.h index 639e117355c..92d0945a029 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_external.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_external.h @@ -28,9 +28,9 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield(); +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield(); -_LIBCUDACXX_HIDE_FROM_ABI void 
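// Aside: a hedged sketch of the polling strategy used by the renamed
// __cccl_thread_poll_with_backoff above: spin first, yield the processor for the
// second half of the spin budget, then back off by sleeping for a quarter of the
// elapsed time, clamped to the [10us, 1ms] range. kPollingCount and the function
// name below are illustrative stand-ins, not the library's identifiers.
#include <chrono>
#include <thread>

template <class Predicate>
bool poll_with_backoff_sketch(Predicate pred,
                              std::chrono::nanoseconds max_wait = std::chrono::nanoseconds::zero())
{
  using namespace std::chrono;
  constexpr int kPollingCount = 64;
  auto const start            = high_resolution_clock::now();
  for (int count = 0;;)
  {
    if (pred())
    {
      return true;
    }
    if (count < kPollingCount)
    {
      if (count > (kPollingCount >> 1))
      {
        std::this_thread::yield(); // stand-in for the processor-yield hint
      }
      ++count;
      continue;
    }
    auto const elapsed = high_resolution_clock::now() - start;
    if (max_wait != nanoseconds::zero() && elapsed > max_wait)
    {
      return false; // timed out
    }
    auto const step = elapsed / 4;
    if (step >= milliseconds(1))
    {
      std::this_thread::sleep_for(milliseconds(1));
    }
    else if (step >= microseconds(10))
    {
      std::this_thread::sleep_for(step);
    }
    else
    {
      std::this_thread::yield();
    }
  }
}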
__libcpp_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns); +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns); _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h b/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h index 4b1af8c7bc2..3da59117761 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h @@ -40,51 +40,51 @@ _CCCL_PUSH_MACROS -typedef ::timespec __libcpp_timespec_t; +typedef ::timespec __cccl_timespec_t; _LIBCUDACXX_BEGIN_NAMESPACE_STD // Mutex -typedef pthread_mutex_t __libcpp_mutex_t; +typedef pthread_mutex_t __cccl_mutex_t; # define _LIBCUDACXX_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -typedef pthread_mutex_t __libcpp_recursive_mutex_t; +typedef pthread_mutex_t __cccl_recursive_mutex_t; // Condition Variable -typedef pthread_cond_t __libcpp_condvar_t; +typedef pthread_cond_t __cccl_condvar_t; # define _LIBCUDACXX_CONDVAR_INITIALIZER PTHREAD_COND_INITIALIZER // Semaphore # if defined(__APPLE__) -typedef dispatch_semaphore_t __libcpp_semaphore_t; +typedef dispatch_semaphore_t __cccl_semaphore_t; # define _LIBCUDACXX_SEMAPHORE_MAX numeric_limits::max() # else // ^^^ __APPLE__ ^^^ / vvv !__APPLE__ vvv -typedef sem_t __libcpp_semaphore_t; +typedef sem_t __cccl_semaphore_t; # define _LIBCUDACXX_SEMAPHORE_MAX SEM_VALUE_MAX # endif // !__APPLE__ // Execute once -typedef pthread_once_t __libcpp_exec_once_flag; +typedef pthread_once_t __cccl_exec_once_flag; # define _LIBCUDACXX_EXEC_ONCE_INITIALIZER PTHREAD_ONCE_INIT // Thread id -typedef pthread_t __libcpp_thread_id; +typedef pthread_t __cccl_thread_id; // Thread # define _LIBCUDACXX_NULL_THREAD 0U -typedef pthread_t __libcpp_thread_t; +typedef pthread_t __cccl_thread_t; // Thread Local Storage -typedef pthread_key_t __libcpp_tls_key; +typedef pthread_key_t __cccl_tls_key; # define _LIBCUDACXX_TLS_DESTRUCTOR_CC -_LIBCUDACXX_HIDE_FROM_ABI __libcpp_timespec_t __libcpp_to_timespec(const _CUDA_VSTD::chrono::nanoseconds& __ns) +_LIBCUDACXX_HIDE_FROM_ABI __cccl_timespec_t __cccl_to_timespec(const _CUDA_VSTD::chrono::nanoseconds& __ns) { using namespace chrono; seconds __s = duration_cast(__ns); - __libcpp_timespec_t __ts; + __cccl_timespec_t __ts; typedef decltype(__ts.tv_sec) ts_sec; constexpr ts_sec __ts_sec_max = numeric_limits::max(); @@ -104,73 +104,73 @@ _LIBCUDACXX_HIDE_FROM_ABI __libcpp_timespec_t __libcpp_to_timespec(const _CUDA_V // Semaphore # if defined(__APPLE__) -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_init(__libcpp_semaphore_t* __sem, int __init) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_init(__cccl_semaphore_t* __sem, int __init) { return (*__sem = dispatch_semaphore_create(__init)) != nullptr; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_destroy(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_destroy(__cccl_semaphore_t* __sem) { dispatch_release(*__sem); return true; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_post(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_post(__cccl_semaphore_t* __sem) { dispatch_semaphore_signal(*__sem); return true; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_wait(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_wait(__cccl_semaphore_t* __sem) { return dispatch_semaphore_wait(*__sem, DISPATCH_TIME_FOREVER) == 0; } _LIBCUDACXX_HIDE_FROM_ABI bool 
-__libcpp_semaphore_wait_timed(__libcpp_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) +__cccl_semaphore_wait_timed(__cccl_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) { return dispatch_semaphore_wait(*__sem, dispatch_time(DISPATCH_TIME_NOW, __ns.count())) == 0; } # else // ^^^ __APPLE__ ^^^ / vvv !__APPLE__ vvv -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_init(__libcpp_semaphore_t* __sem, int __init) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_init(__cccl_semaphore_t* __sem, int __init) { return sem_init(__sem, 0, __init) == 0; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_destroy(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_destroy(__cccl_semaphore_t* __sem) { return sem_destroy(__sem) == 0; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_post(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_post(__cccl_semaphore_t* __sem) { return sem_post(__sem) == 0; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_wait(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_wait(__cccl_semaphore_t* __sem) { return sem_wait(__sem) == 0; } _LIBCUDACXX_HIDE_FROM_ABI bool -__libcpp_semaphore_wait_timed(__libcpp_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) +__cccl_semaphore_wait_timed(__cccl_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) { - __libcpp_timespec_t __ts = __libcpp_to_timespec(__ns); + __cccl_timespec_t __ts = __cccl_to_timespec(__ns); return sem_timedwait(__sem, &__ts) == 0; } # endif // !__APPLE__ -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield() +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield() { sched_yield(); } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) { - __libcpp_timespec_t __ts = __libcpp_to_timespec(__ns); + __cccl_timespec_t __ts = __cccl_to_timespec(__ns); while (nanosleep(&__ts, &__ts) == -1 && errno == EINTR) ; } diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_win32.h b/libcudacxx/include/cuda/std/__thread/threading_support_win32.h index ff8bd6a35fe..ab59307e5aa 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_win32.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_win32.h @@ -32,47 +32,47 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD // Mutex -typedef void* __libcpp_mutex_t; +typedef void* __cccl_mutex_t; # define _LIBCUDACXX_MUTEX_INITIALIZER 0 # if defined(_M_IX86) || defined(__i386__) || defined(_M_ARM) || defined(__arm__) -typedef void* __libcpp_recursive_mutex_t[6]; +typedef void* __cccl_recursive_mutex_t[6]; # elif defined(_M_AMD64) || defined(__x86_64__) || defined(_M_ARM64) || defined(__aarch64__) -typedef void* __libcpp_recursive_mutex_t[5]; +typedef void* __cccl_recursive_mutex_t[5]; # else # error Unsupported architecture # endif // Condition Variable -typedef void* __libcpp_condvar_t; +typedef void* __cccl_condvar_t; # define _LIBCUDACXX_CONDVAR_INITIALIZER 0 // Semaphore -typedef void* __libcpp_semaphore_t; +typedef void* __cccl_semaphore_t; // Execute Once -typedef void* __libcpp_exec_once_flag; +typedef void* __cccl_exec_once_flag; # define _LIBCUDACXX_EXEC_ONCE_INITIALIZER 0 // Thread ID -typedef long __libcpp_thread_id; +typedef long __cccl_thread_id; // Thread # define _LIBCUDACXX_NULL_THREAD 0U -typedef void* __libcpp_thread_t; +typedef void* __cccl_thread_t; // 
Thread Local Storage -typedef long __libcpp_tls_key; +typedef long __cccl_tls_key; # define _LIBCUDACXX_TLS_DESTRUCTOR_CC __stdcall -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield() +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield() { SwitchToThread(); } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_sleep_for(chrono::nanoseconds __ns) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(chrono::nanoseconds __ns) { using namespace chrono; // round-up to the nearest milisecond diff --git a/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h b/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h index cc74e6bbbd8..8b70295ce14 100644 --- a/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h +++ b/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h @@ -31,7 +31,7 @@ using add_lvalue_reference_t _CCCL_NODEBUG_ALIAS = _CCCL_BUILTIN_ADD_LVALUE_REFE #else // ^^^ _CCCL_BUILTIN_ADD_LVALUE_REFERENCE ^^^ / vvv !_CCCL_BUILTIN_ADD_LVALUE_REFERENCE vvv -template ::value> +template ::value> struct __add_lvalue_reference_impl { typedef _CCCL_NODEBUG_ALIAS _Tp type; diff --git a/libcudacxx/include/cuda/std/__type_traits/add_pointer.h b/libcudacxx/include/cuda/std/__type_traits/add_pointer.h index bf89c4fd082..65986787c84 100644 --- a/libcudacxx/include/cuda/std/__type_traits/add_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/add_pointer.h @@ -34,7 +34,7 @@ template using add_pointer_t _CCCL_NODEBUG_ALIAS = _CCCL_BUILTIN_ADD_POINTER(_Tp); #else // ^^^ _CCCL_BUILTIN_ADD_POINTER ^^^ / vvv !_CCCL_BUILTIN_ADD_POINTER vvv -template ::value || is_void<_Tp>::value> +template ::value || is_void<_Tp>::value> struct __add_pointer_impl { typedef _CCCL_NODEBUG_ALIAS remove_reference_t<_Tp>* type; diff --git a/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h b/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h index c9704de4092..eb9e3f0acdf 100644 --- a/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h +++ b/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h @@ -31,7 +31,7 @@ using add_rvalue_reference_t _CCCL_NODEBUG_ALIAS = _CCCL_BUILTIN_ADD_RVALUE_REFE #else // ^^^ _CCCL_BUILTIN_ADD_RVALUE_REFERENCE ^^^ / vvv !_CCCL_BUILTIN_ADD_RVALUE_REFERENCE vvv -template ::value> +template ::value> struct __add_rvalue_reference_impl { typedef _CCCL_NODEBUG_ALIAS _Tp type; diff --git a/libcudacxx/include/cuda/std/__type_traits/decay.h b/libcudacxx/include/cuda/std/__type_traits/decay.h index 2888466585c..b8d5a744cfd 100644 --- a/libcudacxx/include/cuda/std/__type_traits/decay.h +++ b/libcudacxx/include/cuda/std/__type_traits/decay.h @@ -66,7 +66,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT decay typedef _CCCL_NODEBUG_ALIAS remove_reference_t<_Tp> _Up; public: - typedef _CCCL_NODEBUG_ALIAS typename __decay_impl<_Up, __libcpp_is_referenceable<_Up>::value>::type type; + typedef _CCCL_NODEBUG_ALIAS typename __decay_impl<_Up, __cccl_is_referenceable<_Up>::value>::type type; }; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h b/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h index fa762d24b2d..983e17e3553 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h @@ -26,10 +26,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_bounded_array : false_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_bounded_array : false_type {}; 
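The fallback `__add_lvalue_reference_impl` / `__add_pointer_impl` / `__add_rvalue_reference_impl` hunks above only form `_Tp&` or `_Tp*` when the argument is referenceable, so `void` and cv- or ref-qualified function types fall back to the identity. A minimal stand-alone sketch of that guard, with illustrative names rather than the library's:

#include <type_traits>

// Detect "referenceable" types: T& is well-formed exactly for object types,
// references, and unqualified function types.
template <class T, class = void>
struct is_referenceable_sketch : std::false_type {};

template <class T>
struct is_referenceable_sketch<T, std::void_t<T&>> : std::true_type {};

// add_lvalue_reference falls back to the identity when T is not referenceable.
template <class T, bool = is_referenceable_sketch<T>::value>
struct add_lvalue_reference_sketch { using type = T; };

template <class T>
struct add_lvalue_reference_sketch<T, true> { using type = T&; };

static_assert(std::is_same_v<add_lvalue_reference_sketch<int>::type, int&>, "");
static_assert(std::is_same_v<add_lvalue_reference_sketch<void>::type, void>, "");
static_assert(std::is_same_v<add_lvalue_reference_sketch<void() const>::type, void() const>, "");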
template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_bounded_array<_Tp[_Np]> : true_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_bounded_array<_Tp[_Np]> : true_type {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h index b4281c6c637..fc24b17077a 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h @@ -27,21 +27,16 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_constant_evaluated() noexcept { return _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(); } - -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_is_constant_evaluated() noexcept -{ - return _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(); -} -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_default_is_constant_evaluated() noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __cccl_default_is_constant_evaluated() noexcept { return _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(); } #else // ^^^ _CCCL_BUILTIN_IS_CONSTANT_EVALUATED ^^^ / vvv !_CCCL_BUILTIN_IS_CONSTANT_EVALUATED vvv -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_is_constant_evaluated() noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_constant_evaluated() noexcept { return false; } -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_default_is_constant_evaluated() noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __cccl_default_is_constant_evaluated() noexcept { return true; } diff --git a/libcudacxx/include/cuda/std/__type_traits/is_constructible.h b/libcudacxx/include/cuda/std/__type_traits/is_constructible.h index cd82aa9397c..579c45c0295 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_constructible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_constructible.h @@ -45,7 +45,7 @@ struct __nat || defined(_LIBCUDACXX_USE_IS_CONSTRUCTIBLE_FALLBACK)) template -struct __libcpp_is_constructible; +struct __cccl_is_constructible; template struct __is_invalid_base_to_derived_cast @@ -54,8 +54,7 @@ struct __is_invalid_base_to_derived_cast using _RawFrom = remove_cvref_t<_From>; using _RawTo = remove_cvref_t<_To>; static const bool value = - _And<_IsNotSame<_RawFrom, _RawTo>, is_base_of<_RawFrom, _RawTo>, _Not<__libcpp_is_constructible<_RawTo, _From>>>:: - value; + _And<_IsNotSame<_RawFrom, _RawTo>, is_base_of<_RawFrom, _RawTo>, _Not<__cccl_is_constructible<_RawTo, _From>>>::value; }; template @@ -123,26 +122,26 @@ struct __is_default_constructible<_Tp[_Nx], false> : __is_default_constructible< {}; template -struct __libcpp_is_constructible +struct __cccl_is_constructible { static_assert(sizeof...(_Args) > 1, "Wrong specialization"); typedef decltype(__is_constructible_helper::__test_nary<_Tp, _Args...>(0)) type; }; template -struct __libcpp_is_constructible<_Tp> : __is_default_constructible<_Tp> +struct __cccl_is_constructible<_Tp> : __is_default_constructible<_Tp> {}; template -struct __libcpp_is_constructible<_Tp, _A0> : public decltype(__is_constructible_helper::__test_unary<_Tp, _A0>(0)) +struct __cccl_is_constructible<_Tp, _A0> : public decltype(__is_constructible_helper::__test_unary<_Tp, _A0>(0)) {}; template -struct __libcpp_is_constructible<_Tp&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&, _A0>(0)) +struct __cccl_is_constructible<_Tp&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&, _A0>(0)) {}; template -struct __libcpp_is_constructible<_Tp&&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&&, 
_A0>(0)) +struct __cccl_is_constructible<_Tp&&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&&, _A0>(0)) {}; #endif @@ -160,7 +159,7 @@ _CCCL_INLINE_VAR constexpr bool is_constructible_v = _CCCL_BUILTIN_IS_CONSTRUCTI #else template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_constructible : public __libcpp_is_constructible<_Tp, _Args...>::type +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_constructible : public __cccl_is_constructible<_Tp, _Args...>::type {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_empty.h b/libcudacxx/include/cuda/std/__type_traits/is_empty.h index 4b11bc7da88..dc2a3691321 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_empty.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_empty.h @@ -50,15 +50,15 @@ struct __is_empty2 }; template -struct __libcpp_empty : public integral_constant) == sizeof(__is_empty2)> +struct __cccl_empty : public integral_constant) == sizeof(__is_empty2)> {}; template -struct __libcpp_empty<_Tp, false> : public false_type +struct __cccl_empty<_Tp, false> : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_empty : public __libcpp_empty<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_empty : public __cccl_empty<_Tp> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h index 59336c6acad..913bacdb2a6 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h @@ -26,20 +26,20 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_floating_point : public false_type +struct __cccl_is_floating_point : public false_type {}; template <> -struct __libcpp_is_floating_point : public true_type +struct __cccl_is_floating_point : public true_type {}; template <> -struct __libcpp_is_floating_point : public true_type +struct __cccl_is_floating_point : public true_type {}; template <> -struct __libcpp_is_floating_point : public true_type +struct __cccl_is_floating_point : public true_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_floating_point : public __libcpp_is_floating_point> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_floating_point : public __cccl_is_floating_point> {}; #if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_integral.h b/libcudacxx/include/cuda/std/__type_traits/is_integral.h index d3b412b8135..eddcba144c5 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_integral.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_integral.h @@ -39,72 +39,72 @@ _CCCL_INLINE_VAR constexpr bool is_integral_v = _CCCL_BUILTIN_IS_INTEGRAL(_Tp); #else // ^^^ _CCCL_BUILTIN_IS_INTEGRAL ^^^ / vvv !_CCCL_BUILTIN_IS_INTEGRAL vvv template -struct __libcpp_is_integral : public false_type +struct __cccl_is_integral : public false_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # ifndef 
_LIBCUDACXX_NO_HAS_CHAR8_T template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # endif # ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # ifndef _LIBCUDACXX_HAS_NO_INT128 template <> -struct __libcpp_is_integral<__int128_t> : public true_type +struct __cccl_is_integral<__int128_t> : public true_type {}; template <> -struct __libcpp_is_integral<__uint128_t> : public true_type +struct __cccl_is_integral<__uint128_t> : public true_type {}; # endif template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_integral - : public integral_constant>::value> + : public integral_constant>::value> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h b/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h index 943ed414a5c..fff6f96ee81 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_member_pointer +struct __cccl_is_member_pointer { enum { @@ -38,7 +38,7 @@ struct __libcpp_is_member_pointer }; }; template -struct __libcpp_is_member_pointer<_Tp _Up::*> +struct __cccl_is_member_pointer<_Tp _Up::*> { enum { @@ -64,7 +64,7 @@ _CCCL_INLINE_VAR constexpr bool is_member_function_pointer_v = _CCCL_BUILTIN_IS_ template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_member_function_pointer - : public integral_constant>::__is_func> + : public integral_constant>::__is_func> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h b/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h index b9f411cf9d7..86ce9dd9d26 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h @@ -42,7 +42,7 @@ _CCCL_INLINE_VAR constexpr bool is_member_object_pointer_v = _CCCL_BUILTIN_IS_ME template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_member_object_pointer - : public integral_constant>::__is_obj> + : public integral_constant>::__is_obj> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h 
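// Aside: a hedged sketch of the decomposition behind the renamed
// __cccl_is_member_pointer helper above: a single partial specialization on
// T U::* classifies member pointers, and is_function on the pointee separates
// member functions (used by is_member_function_pointer) from member objects
// (used by is_member_object_pointer). Names below are illustrative.
#include <type_traits>

template <class T>
struct member_pointer_sketch
{
  enum { is_member = false, is_func = false, is_obj = false };
};

template <class T, class U>
struct member_pointer_sketch<T U::*>
{
  enum
  {
    is_member = true,
    is_func   = std::is_function<T>::value,
    is_obj    = !is_func
  };
};

struct Widget
{
  int  field;
  void method();
};

static_assert(member_pointer_sketch<decltype(&Widget::field)>::is_obj, "");
static_assert(member_pointer_sketch<decltype(&Widget::method)>::is_func, "");
static_assert(!member_pointer_sketch<int*>::is_member, "");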
b/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h index 2f0ff0d5eb6..74ceaf6e7d3 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h @@ -42,7 +42,7 @@ _CCCL_INLINE_VAR constexpr bool is_member_pointer_v = _CCCL_BUILTIN_IS_MEMBER_PO template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_member_pointer - : public integral_constant>::__is_member> + : public integral_constant>::__is_member> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h index 3232e3eff2c..b12662cb9cc 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h @@ -42,20 +42,20 @@ _CCCL_INLINE_VAR constexpr bool is_nothrow_assignable_v = _CCCL_BUILTIN_IS_NOTHR #elif !defined(_LIBCUDACXX_HAS_NO_NOEXCEPT) && !defined(_LIBCUDACXX_HAS_NO_NOEXCEPT_SFINAE) template -struct __libcpp_is_nothrow_assignable; +struct __cccl_is_nothrow_assignable; template -struct __libcpp_is_nothrow_assignable : public false_type +struct __cccl_is_nothrow_assignable : public false_type {}; template -struct __libcpp_is_nothrow_assignable +struct __cccl_is_nothrow_assignable : public integral_constant() = _CUDA_VSTD::declval<_Arg>())> {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_assignable - : public __libcpp_is_nothrow_assignable::value, _Tp, _Arg> + : public __cccl_is_nothrow_assignable::value, _Tp, _Arg> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h index b225e46cbc0..62440f9b26e 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h @@ -44,10 +44,10 @@ _CCCL_INLINE_VAR constexpr bool is_nothrow_constructible_v = _CCCL_BUILTIN_IS_NO # if !defined(_LIBCUDACXX_HAS_NO_NOEXCEPT) template -struct __libcpp_is_nothrow_constructible; +struct __cccl_is_nothrow_constructible; template -struct __libcpp_is_nothrow_constructible +struct __cccl_is_nothrow_constructible : public integral_constant()...))> {}; @@ -56,22 +56,22 @@ _LIBCUDACXX_HIDE_FROM_ABI void __implicit_conversion_to(_Tp) noexcept {} template -struct __libcpp_is_nothrow_constructible +struct __cccl_is_nothrow_constructible : public integral_constant(_CUDA_VSTD::declval<_Arg>()))> {}; template -struct __libcpp_is_nothrow_constructible : public false_type +struct __cccl_is_nothrow_constructible : public false_type {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_constructible - : __libcpp_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp, _Args...> + : __cccl_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp, _Args...> {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_constructible<_Tp[_Ns]> - : __libcpp_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp> + : __cccl_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp> {}; # else diff --git a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h index 1cd366424de..23821feaffd 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h +++ 
b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h @@ -41,16 +41,16 @@ struct is_nothrow_destructible : public integral_constant::value> -struct __libcpp_is_nothrow_destructible : false_type +struct __cccl_is_nothrow_destructible : false_type {}; template -struct __libcpp_is_nothrow_destructible<_Tp, true> +struct __cccl_is_nothrow_destructible<_Tp, true> : public integral_constant().~_Tp())> {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible : public __libcpp_is_nothrow_destructible<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible : public __cccl_is_nothrow_destructible<_Tp> {}; template @@ -68,12 +68,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible<_Tp&&> : public tru #else template -struct __libcpp_nothrow_destructor : public integral_constant::value || is_reference<_Tp>::value> +struct __cccl_nothrow_destructor : public integral_constant::value || is_reference<_Tp>::value> {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible - : public __libcpp_nothrow_destructor> + : public __cccl_nothrow_destructor> {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_pointer.h b/libcudacxx/include/cuda/std/__type_traits/is_pointer.h index b87e5537ca1..67969fbbb56 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_pointer.h @@ -39,14 +39,14 @@ _CCCL_INLINE_VAR constexpr bool is_pointer_v = _CCCL_BUILTIN_IS_POINTER(_Tp); #else template -struct __libcpp_is_pointer : public false_type +struct __cccl_is_pointer : public false_type {}; template -struct __libcpp_is_pointer<_Tp*> : public true_type +struct __cccl_is_pointer<_Tp*> : public true_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_pointer : public __libcpp_is_pointer> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_pointer : public __cccl_is_pointer> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h b/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h index 63d2d71fb52..c2ddc771904 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h @@ -28,11 +28,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_BUILTIN_IS_REFERENCEABLE) && !defined(_LIBCUDACXX_USE_IS_REFERENCEABLE_FALLBACK) template -struct __libcpp_is_referenceable : public integral_constant +struct __cccl_is_referenceable : public integral_constant {}; #else -struct __libcpp_is_referenceable_impl +struct __cccl_is_referenceable_impl { template _CCCL_HOST_DEVICE static _Tp& __test(int); @@ -41,8 +41,8 @@ struct __libcpp_is_referenceable_impl }; template -struct __libcpp_is_referenceable - : integral_constant(0)), false_type>::value> +struct __cccl_is_referenceable + : integral_constant(0)), false_type>::value> {}; #endif // defined(_CCCL_BUILTIN_IS_REFERENCEABLE) && !defined(_LIBCUDACXX_USE_IS_REFERENCEABLE_FALLBACK) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_signed.h b/libcudacxx/include/cuda/std/__type_traits/is_signed.h index 33e5c1eb5c5..220790002ba 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_signed.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_signed.h @@ -42,23 +42,23 @@ _CCCL_INLINE_VAR constexpr bool is_signed_v = _CCCL_BUILTIN_IS_SIGNED(_Tp); #else template ::value> -struct __libcpp_is_signed_impl : public bool_constant<(_Tp(-1) < _Tp(0))> +struct __cccl_is_signed_impl : public 
bool_constant<(_Tp(-1) < _Tp(0))> {}; template -struct __libcpp_is_signed_impl<_Tp, false> : public true_type +struct __cccl_is_signed_impl<_Tp, false> : public true_type {}; // floating point template ::value> -struct __libcpp_is_signed : public __libcpp_is_signed_impl<_Tp> +struct __cccl_is_signed : public __cccl_is_signed_impl<_Tp> {}; template -struct __libcpp_is_signed<_Tp, false> : public false_type +struct __cccl_is_signed<_Tp, false> : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_signed : public __libcpp_is_signed<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_signed : public __cccl_is_signed<_Tp> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h index 69ce3aa8a6e..273df0d830b 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h @@ -25,26 +25,26 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_signed_integer : public false_type +struct __cccl_is_signed_integer : public false_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; #ifndef _LIBCUDACXX_HAS_NO_INT128 template <> -struct __libcpp_is_signed_integer<__int128_t> : public true_type +struct __cccl_is_signed_integer<__int128_t> : public true_type {}; #endif diff --git a/libcudacxx/include/cuda/std/__type_traits/is_swappable.h b/libcudacxx/include/cuda/std/__type_traits/is_swappable.h index 964f14d6381..d2727ced8b4 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_swappable.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_swappable.h @@ -163,7 +163,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT is_swappable_with template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_swappable - : public conditional_t<__libcpp_is_referenceable<_Tp>::value, + : public conditional_t<__cccl_is_referenceable<_Tp>::value, is_swappable_with, add_lvalue_reference_t<_Tp>>, false_type> {}; @@ -175,7 +175,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_swappable_with template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_swappable - : public conditional_t<__libcpp_is_referenceable<_Tp>::value, + : public conditional_t<__cccl_is_referenceable<_Tp>::value, is_nothrow_swappable_with, add_lvalue_reference_t<_Tp>>, false_type> {}; diff --git a/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h b/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h index 9116ced5e8f..57a4af4829c 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h @@ -47,12 +47,12 @@ _CCCL_SUPPRESS_DEPRECATED_POP #else template -struct __libcpp_trivial_destructor : public integral_constant::value || is_reference<_Tp>::value> +struct __cccl_trivial_destructor : public integral_constant::value || is_reference<_Tp>::value> {}; template 
struct _CCCL_TYPE_VISIBILITY_DEFAULT is_trivially_destructible - : public __libcpp_trivial_destructor> + : public __cccl_trivial_destructor> {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h b/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h index 501efded75a..2e09d4c8726 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h @@ -25,10 +25,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_unbounded_array : false_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_unbounded_array : false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_unbounded_array<_Tp[]> : true_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_unbounded_array<_Tp[]> : true_type {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_union.h b/libcudacxx/include/cuda/std/__type_traits/is_union.h index 37ee313d8a3..9978f99e6be 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_union.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_union.h @@ -39,10 +39,10 @@ _CCCL_INLINE_VAR constexpr bool is_union_v = _CCCL_BUILTIN_IS_UNION(_Tp); #else template -struct __libcpp_union : public false_type +struct __cccl_union : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_union : public __libcpp_union> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_union : public __cccl_union> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h index abd951c7202..4a5ad7d92e2 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h @@ -45,23 +45,23 @@ _CCCL_INLINE_VAR constexpr bool is_unsigned_v = _CCCL_BUILTIN_IS_UNSIGNED(_Tp); #else template ::value> -struct __libcpp_is_unsigned_impl : public bool_constant<(_Tp(0) < _Tp(-1))> +struct __cccl_is_unsigned_impl : public bool_constant<(_Tp(0) < _Tp(-1))> {}; template -struct __libcpp_is_unsigned_impl<_Tp, false> : public false_type +struct __cccl_is_unsigned_impl<_Tp, false> : public false_type {}; // floating point template ::value> -struct __libcpp_is_unsigned : public __libcpp_is_unsigned_impl<_Tp> +struct __cccl_is_unsigned : public __cccl_is_unsigned_impl<_Tp> {}; template -struct __libcpp_is_unsigned<_Tp, false> : public false_type +struct __cccl_is_unsigned<_Tp, false> : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_unsigned : public __libcpp_is_unsigned<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_unsigned : public __cccl_is_unsigned<_Tp> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h index 888020032ed..088c98af66a 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h @@ -25,26 +25,26 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_unsigned_integer : public false_type +struct __cccl_is_unsigned_integer : public false_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; 
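The fallback `__cccl_is_signed` / `__cccl_is_unsigned` traits renamed above reduce signedness detection to a single comparison: converting `-1` to an unsigned type wraps to its maximum value. A compact stand-alone sketch of that arithmetic trick (it collapses the library's two-level arithmetic/integral dispatch into one layer, and the names are illustrative):

#include <type_traits>

// For arithmetic T, T(-1) < T(0) holds exactly when T can represent negative
// values; non-arithmetic types are neither signed nor unsigned.
template <class T, bool = std::is_arithmetic<T>::value>
struct is_signed_sketch : std::bool_constant<(T(-1) < T(0))> {};

template <class T>
struct is_signed_sketch<T, false> : std::false_type {};

template <class T, bool = std::is_arithmetic<T>::value>
struct is_unsigned_sketch : std::bool_constant<(T(0) < T(-1))> {};

template <class T>
struct is_unsigned_sketch<T, false> : std::false_type {};

static_assert(is_signed_sketch<int>::value && !is_signed_sketch<unsigned>::value, "");
static_assert(is_unsigned_sketch<unsigned char>::value && !is_unsigned_sketch<float>::value, "");
static_assert(!is_signed_sketch<void*>::value && !is_unsigned_sketch<void*>::value, "");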
template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; #ifndef _LIBCUDACXX_HAS_NO_INT128 template <> -struct __libcpp_is_unsigned_integer<__uint128_t> : public true_type +struct __cccl_is_unsigned_integer<__uint128_t> : public true_type {}; #endif diff --git a/libcudacxx/include/cuda/std/__utility/unreachable.h b/libcudacxx/include/cuda/std/__utility/unreachable.h index e0d704c9b6d..a3b23397e0a 100644 --- a/libcudacxx/include/cuda/std/__utility/unreachable.h +++ b/libcudacxx/include/cuda/std/__utility/unreachable.h @@ -22,7 +22,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __libcpp_unreachable() +_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __cccl_unreachable() { _CCCL_UNREACHABLE(); } diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__string b/libcudacxx/include/cuda/std/detail/libcxx/include/__string index 93cba133797..20857deef7c 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__string +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__string @@ -262,7 +262,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT char_traits { #if _CCCL_COMPILER(GCC, <, 13) // absurd workaround for GCC "internal compiler error: in cxx_eval_array_reference" - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) ; #endif #if defined(_CCCL_BUILTIN_STRLEN) @@ -470,7 +470,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr size_t __char_traits_length_checked(const ty #if _LIBCUDACXX_DEBUG_LEVEL >= 1 return __s ? 
_Traits::length(__s) - : (_CUDA_VSTD::__libcpp_debug_function(_CUDA_VSTD::__libcpp_debug_info( + : (_CUDA_VSTD::__cccl_debug_function(_CUDA_VSTD::__cccl_debug_info( __FILE__, __LINE__, "p == nullptr", "null pointer pass to non-null argument of char_traits<...>::length")), 0); #else diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm b/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm index 01e92219370..ec32a3e3f77 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm @@ -1070,7 +1070,7 @@ typename uniform_int_distribution<_IntType>::result_type uniform_int_distributio { return static_cast(_Eng(__g, _Dt)()); } - size_t __w = _Dt - __libcpp_clz(_Rp) - 1; + size_t __w = _Dt - __cccl_clz(_Rp) - 1; if ((_Rp & (std::numeric_limits<_UIntType>::max() >> (_Dt - __w))) != 0) { ++__w; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index 4e98f7c9774..7eecbcc4a20 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -509,7 +509,7 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) #if _CCCL_STD_VER > 2011 && defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) // Avoid floating point operations that are invalid during constant evaluation - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); @@ -652,7 +652,7 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) #if _CCCL_STD_VER > 2011 && defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) // Avoid floating point operations that are invalid during constant evaluation - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); @@ -841,12 +841,12 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp real(const complex<_Tp>& __c // 26.3.7 values: template -struct __libcpp_complex_overload_traits +struct __cccl_complex_overload_traits {}; // Integral Types template -struct __libcpp_complex_overload_traits<_Tp, true, false> +struct __cccl_complex_overload_traits<_Tp, true, false> { using _ValueType = double; using _ComplexType = complex; @@ -854,20 +854,20 @@ struct __libcpp_complex_overload_traits<_Tp, true, false> // Floating point types template -struct __libcpp_complex_overload_traits<_Tp, false, true> +struct __cccl_complex_overload_traits<_Tp, false, true> { using _ValueType = _Tp; using _ComplexType = complex<_Tp>; }; template -using __libcpp_complex_value_type = typename __libcpp_complex_overload_traits<_Tp>::_ValueType; +using __cccl_complex_value_type = typename __cccl_complex_overload_traits<_Tp>::_ValueType; template -using __libcpp_complex_complex_type = typename __libcpp_complex_overload_traits<_Tp>::_ComplexType; +using __cccl_complex_complex_type = typename __cccl_complex_overload_traits<_Tp>::_ComplexType; template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_value_type<_Tp> real(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_value_type<_Tp> real(_Tp __re) { return __re; } @@ -881,7 +881,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp imag(const complex<_Tp>& __c } template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_value_type<_Tp> 
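// Aside: a hedged sketch (illustrative names, not the library's) of what the
// renamed __cccl_complex_overload_traits above selects: the scalar overloads of
// real/imag/norm/conj/proj promote integral arguments to double, keep
// floating-point argument types as-is, and drop out of overload resolution for
// anything else.
#include <complex>
#include <type_traits>

template <class T,
          bool = std::is_integral<T>::value,
          bool = std::is_floating_point<T>::value>
struct complex_overload_traits_sketch
{}; // no members: SFINAEs the scalar overloads away

template <class T>
struct complex_overload_traits_sketch<T, true, false> // integral -> double
{
  using value_type   = double;
  using complex_type = std::complex<double>;
};

template <class T>
struct complex_overload_traits_sketch<T, false, true> // floating point -> T
{
  using value_type   = T;
  using complex_type = std::complex<T>;
};

template <class T>
typename complex_overload_traits_sketch<T>::complex_type conj_sketch(T re)
{
  return typename complex_overload_traits_sketch<T>::complex_type(re);
}

static_assert(std::is_same<decltype(conj_sketch(1)), std::complex<double>>::value, "");
static_assert(std::is_same<decltype(conj_sketch(1.0f)), std::complex<float>>::value, "");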
imag(_Tp) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_value_type<_Tp> imag(_Tp) { return 0; } @@ -940,9 +940,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp norm(const complex<_Tp>& __c } template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_value_type<_Tp> norm(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_value_type<_Tp> norm(_Tp __re) { - return static_cast<__libcpp_complex_value_type<_Tp>>(__re) * __re; + return static_cast<__cccl_complex_value_type<_Tp>>(__re) * __re; } // conj @@ -954,9 +954,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 complex<_Tp> conj(const complex< } template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_complex_type<_Tp> conj(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_complex_type<_Tp> conj(_Tp __re) { - return __libcpp_complex_complex_type<_Tp>(__re); + return __cccl_complex_complex_type<_Tp>(__re); } // proj @@ -973,7 +973,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> proj(const complex<_Tp>& __c) } template -_LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __libcpp_complex_complex_type<_Tp>> proj(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __cccl_complex_complex_type<_Tp>> proj(_Tp __re) { if (_CUDA_VSTD::__constexpr_isinf(__re)) { @@ -983,9 +983,9 @@ _LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __libcpp_c } template -_LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __libcpp_complex_complex_type<_Tp>> proj(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __cccl_complex_complex_type<_Tp>> proj(_Tp __re) { - return __libcpp_complex_complex_type<_Tp>(__re); + return __cccl_complex_complex_type<_Tp>(__re); } // polar diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/limits b/libcudacxx/include/cuda/std/detail/libcxx/include/limits index ea830da6046..82f6a00c804 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/limits +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/limits @@ -141,7 +141,7 @@ enum float_denorm_style }; template ::value> -class __libcpp_numeric_limits +class __cccl_numeric_limits { protected: typedef _Tp type; @@ -215,20 +215,20 @@ protected: _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4309) template -struct __libcpp_compute_min +struct __cccl_compute_min { static constexpr _Tp value = static_cast<_Tp>(_Tp(1) << __digits); }; _CCCL_DIAG_POP template -struct __libcpp_compute_min<_Tp, __digits, false> +struct __cccl_compute_min<_Tp, __digits, false> { static constexpr _Tp value = _Tp(0); }; template -class __libcpp_numeric_limits<_Tp, true> +class __cccl_numeric_limits<_Tp, true> { protected: typedef _Tp type; @@ -239,7 +239,7 @@ protected: static constexpr int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); static constexpr int digits10 = digits * 3 / 10; static constexpr int max_digits10 = 0; - static constexpr type __min = __libcpp_compute_min::value; + static constexpr type __min = __cccl_compute_min::value; static constexpr type __max = is_signed ? 
type(type(~0) ^ __min) : type(~0);
   _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept
   {
@@ -307,7 +307,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<bool, true>
+class __cccl_numeric_limits<bool, true>
 {
 protected:
   typedef bool type;
@@ -382,7 +382,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<float, true>
+class __cccl_numeric_limits<float, true>
 {
 protected:
   typedef float type;
@@ -470,7 +470,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<double, true>
+class __cccl_numeric_limits<double, true>
 {
 protected:
   typedef double type;
@@ -558,7 +558,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<long double, true>
+class __cccl_numeric_limits<long double, true>
 {
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
@@ -634,9 +634,9 @@ protected:
 };
 
 template <class _Tp>
-class _CCCL_TYPE_VISIBILITY_DEFAULT numeric_limits : private __libcpp_numeric_limits<remove_cv_t<_Tp>>
+class _CCCL_TYPE_VISIBILITY_DEFAULT numeric_limits : private __cccl_numeric_limits<remove_cv_t<_Tp>>
 {
-  typedef __libcpp_numeric_limits<remove_cv_t<_Tp>> __base;
+  typedef __cccl_numeric_limits<remove_cv_t<_Tp>> __base;
   typedef typename __base::type type;
 
 public:
diff --git a/libcudacxx/test/NOTES.TXT b/libcudacxx/test/NOTES.TXT
index 602de495103..ae5c1575281 100644
--- a/libcudacxx/test/NOTES.TXT
+++ b/libcudacxx/test/NOTES.TXT
@@ -12,7 +12,7 @@ These notes contain a list of things that must be done after branching for an LLVM release.
 1. Update _LIBCUDACXX_VERSION in `__config`
-2. Update the __libcpp_version file.
+2. Update the __cccl_version file.
 3. Update the version number in `docs/conf.py`
 4. Create ABI lists for the previous release under `lib/abi`
diff --git a/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp
index 56aa5c72b3d..cacffd6bc5b 100644
--- a/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp
+++ b/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp
@@ -9,7 +9,7 @@
 //
-// __libcpp_is_constant_evaluated()
+// _CUDA_VSTD::is_constant_evaluated()
 // returns false when there's no constant evaluation support from the compiler.
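// Aside: a minimal stand-alone sketch of how the integer specialization above
// derives its extremes: __cccl_compute_min shifts 1 into the sign bit for signed
// types, and the maximum clears that bit from an all-ones pattern. The names
// digits_sketch/min_sketch/max_sketch are illustrative, not library API; the
// narrowing of the shifted value is well-defined under C++20 two's complement.
#include <climits>
#include <type_traits>

// Value bits; the sign bit is excluded for signed types.
template <class T>
constexpr int digits_sketch = static_cast<int>(sizeof(T) * CHAR_BIT) - std::is_signed_v<T>;

// Minimum: 1 shifted into the sign bit for signed integers, 0 for unsigned ones.
template <class T>
constexpr T min_sketch = std::is_signed_v<T> ? static_cast<T>(T(1) << digits_sketch<T>) : T(0);

// Maximum: all bits set, with the sign bit cleared for signed types.
template <class T>
constexpr T max_sketch = std::is_signed_v<T> ? static_cast<T>(T(~T(0)) ^ min_sketch<T>) : T(~T(0));

static_assert(min_sketch<signed char> == -128 && max_sketch<signed char> == 127, "");
static_assert(min_sketch<unsigned short> == 0 && max_sketch<unsigned short> == 0xFFFF, "");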
// as well as when called not in a constexpr context @@ -21,14 +21,14 @@ int main(int, char**) { - ASSERT_SAME_TYPE(decltype(cuda::std::__libcpp_is_constant_evaluated()), bool); - ASSERT_NOEXCEPT(cuda::std::__libcpp_is_constant_evaluated()); + ASSERT_SAME_TYPE(decltype(cuda::std::is_constant_evaluated()), bool); + ASSERT_NOEXCEPT(cuda::std::is_constant_evaluated()); #if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) - static_assert(cuda::std::__libcpp_is_constant_evaluated(), ""); + static_assert(cuda::std::is_constant_evaluated(), ""); #endif - bool p = cuda::std::__libcpp_is_constant_evaluated(); + bool p = cuda::std::is_constant_evaluated(); assert(!p); return 0; diff --git a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp index 8ea4ad3f34f..d1a83917feb 100644 --- a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // -// __libcpp_is_referenceable +// __cccl_is_referenceable // // [defns.referenceable] defines "a referenceable type" as: // An object type, a function type that does not have cv-qualifiers @@ -22,141 +22,141 @@ struct Foo {}; -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); #ifndef _LIBCUDACXX_HAS_NO_VECTOR_EXTENSION -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); 
+static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); #endif // Functions without cv-qualifiers are referenceable -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); 
-static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); // member functions with or without cv-qualifiers are referenceable -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); 
-static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); 
+static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp index d3217b85baf..c02f29f0178 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp @@ -78,7 +78,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() assert(j == 75); test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, int*>(); test>(); @@ -86,7 +86,7 @@ __host__ __device__ 
TEST_CONSTEXPR_CXX14 bool test() } test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, MoveOnly*>(); test>(); diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp index 45bced305b0..2c0c210fb08 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp @@ -84,7 +84,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() assert(j == 75); test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, int*>(); test>(); @@ -92,7 +92,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() } test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, MoveOnly*>(); test>(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp index c240c9759a9..64388533bbb 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp @@ -76,7 +76,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp index 38fd504a48d..a2d12513c3f 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp @@ -234,7 +234,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp index d905877373f..347ed9d777e 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp @@ -224,7 +224,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp index 
0b8280f9a43..f7b8a68031a 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp @@ -66,7 +66,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp index 5f92e85755a..1a6f587c083 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp @@ -72,7 +72,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp index 0b3e86bfc35..e30acffe9c1 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp @@ -283,7 +283,7 @@ __host__ __device__ constexpr void test() { test_default(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test_copy_move(); test_size(); @@ -308,7 +308,7 @@ __host__ __device__ constexpr bool test() test(); // Due to reinterpret_cast within the destructor a on trivially destructible type cannot be constexpr at all - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); } diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp index 3178c1b5b25..a581113b0d0 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp @@ -196,7 +196,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp index cb9f5830d8b..9b80f98a67a 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp @@ -213,7 +213,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp index 5b3590cc789..06029272c43 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp @@ -85,7 +85,7 @@ __host__ __device__ constexpr 
bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp index 3500f591b2d..168cebbd6c2 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp @@ -231,7 +231,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp index 775d9ecb3d5..552246b888b 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp @@ -62,7 +62,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp index 46e2a3fdd88..4d57f632361 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp @@ -67,7 +67,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX23 void test_sfinae() static_assert(cuda::std::is_constructible::value, ""); static_assert(cuda::std::is_constructible::value, ""); static_assert(cuda::std::is_constructible::value, ""); - // FIXME: __libcpp_compressed_pair attempts to perform a move even though + // FIXME: __cccl_compressed_pair attempts to perform a move even though // it should only copy. // D d; // U u(nullptr, cuda::std::move(d)); @@ -149,7 +149,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX23 void test_sfinae_runtime() static_assert(!cuda::std::is_constructible::value, ""); static_assert(!cuda::std::is_constructible::value, ""); static_assert(!cuda::std::is_constructible::value, ""); - // FIXME: __libcpp_compressed_pair attempts to perform a move even though + // FIXME: __cccl_compressed_pair attempts to perform a move even though // it should only copy. // D d; // U u(nullptr, cuda::std::move(d)); diff --git a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp deleted file mode 100644 index edd04088205..00000000000 --- a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++98, c++03 - -// - -#include -#include - -#include "test_macros.h" - -int main(int, char**) -{ -#ifndef _CCCL_BUILTIN_IS_CONSTANT_EVALUATED - // expected-error@+1 {{no member named 'is_constant_evaluated' in namespace 'std'}} - bool b = cuda::std::is_constant_evaluated(); -#else - // expected-error-re@+1 {{{{(static_assert|static assertion)}} failed}} - static_assert(!cuda::std::is_constant_evaluated(), ""); -#endif - return 0; -} diff --git a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp index e655fa3aa6f..c9e544789be 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp @@ -79,7 +79,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert(cuda::std::is_constructible_v, ""); @@ -93,7 +93,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((cuda::std::is_constructible_v), ""); @@ -107,7 +107,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((cuda::std::is_constructible_v), ""); @@ -121,7 +121,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((cuda::std::is_constructible_v), ""); @@ -135,7 +135,7 @@ __host__ __device__ void test_is_not_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((!cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((!cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((!cuda::std::is_constructible_v), ""); @@ -149,7 +149,7 @@ __host__ __device__ void test_is_not_constructible() #if !defined(TEST_COMPILER_MSVC) && !(defined(TEST_COMPILER_CLANG) && __clang_major__ >= 16) // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. 
- static_assert((!cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((!cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((!cuda::std::is_constructible_v), ""); @@ -297,11 +297,11 @@ int main(int, char**) // FIXME Clang disallows this construction because it thinks that // 'static_cast(declval>())' is ill-formed. LIBCPP_STATIC_ASSERT( - clang_disallows_valid_static_cast_bug != cuda::std::__libcpp_is_constructible>::value, ""); + clang_disallows_valid_static_cast_bug != cuda::std::__cccl_is_constructible>::value, ""); ((void) clang_disallows_valid_static_cast_bug); // Prevent unused warning # else static_assert(clang_disallows_valid_static_cast_bug == false, ""); - LIBCPP_STATIC_ASSERT(cuda::std::__libcpp_is_constructible>::value, ""); + LIBCPP_STATIC_ASSERT(cuda::std::__cccl_is_constructible>::value, ""); # endif #endif @@ -309,7 +309,7 @@ int main(int, char**) #if defined(TEST_CLANG_VER) && !defined(TEST_COMPILER_NVCC) test_is_constructible>(); LIBCPP_STATIC_ASSERT( - clang_disallows_valid_static_cast_bug != cuda::std::__libcpp_is_constructible>::value, ""); + clang_disallows_valid_static_cast_bug != cuda::std::__cccl_is_constructible>::value, ""); static_assert(cuda::std::is_constructible>::value, ""); #elif defined(TEST_COMPILER_MSVC) && defined(TEST_COMPILER_NVCC) // FIXME NVCC and MSVC disagree about the validity of these tests, and give diff --git a/libcudacxx/test/support/check_assertion.h b/libcudacxx/test/support/check_assertion.h index 6def8f701e1..8d1a2de8da4 100644 --- a/libcudacxx/test/support/check_assertion.h +++ b/libcudacxx/test/support/check_assertion.h @@ -312,7 +312,7 @@ struct DeathTest std::string stderr_from_child_; }; -void std::__libcpp_verbose_abort(char const* format, ...) +void std::__cccl_verbose_abort(char const* format, ...) 
{ assert(!GlobalMatcher().empty()); diff --git a/libcudacxx/test/support/test_macros.h b/libcudacxx/test/support/test_macros.h index c81987a0dc1..28915d10565 100644 --- a/libcudacxx/test/support/test_macros.h +++ b/libcudacxx/test/support/test_macros.h @@ -149,7 +149,7 @@ #if TEST_HAS_BUILTIN(__builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ || (_CCCL_COMPILER(MSVC) && _MSC_VER > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) -# define TEST_IS_CONSTANT_EVALUATED() _CUDA_VSTD::__libcpp_is_constant_evaluated() +# define TEST_IS_CONSTANT_EVALUATED() cuda::std::is_constant_evaluated() #else # define TEST_IS_CONSTANT_EVALUATED() false #endif From 90120a4a4f06c93c1f4bb5e8677032cd852e8860 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 26 Nov 2024 09:04:04 +0100 Subject: [PATCH 28/45] Add more CUB transform benchmarks (#2906) * Add more CUB transform benchmarks Fixes: #2814 --- .../bench/transform/babelstream1.cu | 2 +- .../bench/transform/babelstream2.cu | 2 +- .../bench/transform/babelstream3.cu | 2 +- .../transform/{babelstream.h => common.h} | 0 cub/benchmarks/bench/transform/complex_cmp.cu | 48 +++++++++++ cub/benchmarks/bench/transform/fib.cu | 76 ++++++++++++++++++ cub/benchmarks/bench/transform/heavy.cu | 79 +++++++++++++++++++ .../nvbench_helper/nvbench_helper.cuh | 13 +++ 8 files changed, 219 insertions(+), 3 deletions(-) rename cub/benchmarks/bench/transform/{babelstream.h => common.h} (100%) create mode 100644 cub/benchmarks/bench/transform/complex_cmp.cu create mode 100644 cub/benchmarks/bench/transform/fib.cu create mode 100644 cub/benchmarks/bench/transform/heavy.cu diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu index 87abdfef6ff..c3b9306398d 100644 --- a/cub/benchmarks/bench/transform/babelstream1.cu +++ b/cub/benchmarks/bench/transform/babelstream1.cu @@ -15,7 +15,7 @@ # endif #endif -#include "babelstream.h" +#include "common.h" #if !TUNE_BASE # if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu index c8fa017b788..61d4e905d92 100644 --- a/cub/benchmarks/bench/transform/babelstream2.cu +++ b/cub/benchmarks/bench/transform/babelstream2.cu @@ -15,7 +15,7 @@ # endif #endif -#include "babelstream.h" +#include "common.h" #if !TUNE_BASE # if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu index db541554210..a5c969764ae 100644 --- a/cub/benchmarks/bench/transform/babelstream3.cu +++ b/cub/benchmarks/bench/transform/babelstream3.cu @@ -15,7 +15,7 @@ # endif #endif -#include "babelstream.h" +#include "common.h" #if !TUNE_BASE # if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/common.h similarity index 100% rename from cub/benchmarks/bench/transform/babelstream.h rename to cub/benchmarks/bench/transform/common.h diff --git a/cub/benchmarks/bench/transform/complex_cmp.cu b/cub/benchmarks/bench/transform/complex_cmp.cu new file mode 100644 index 00000000000..ac9eb4b0f8b --- /dev/null +++ b/cub/benchmarks/bench/transform/complex_cmp.cu @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "common.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +// This benchmark tests overlapping memory regions for reading and is compute intensive + +template +static void compare_complex(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector in = generate(n); + thrust::device_vector out(n - 1); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + // the complex comparison needs lots of compute and transform reads from overlapping input + using compare_op = less_t; + bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{}); +} + +// TODO(bgruber): hardcode OffsetT? +NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types)) + .set_name("compare_complex") + .set_type_axes_names({"OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/transform/fib.cu b/cub/benchmarks/bench/transform/fib.cu new file mode 100644 index 00000000000..8a6c4c3dfa8 --- /dev/null +++ b/cub/benchmarks/bench/transform/fib.cu @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "common.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +// This benchmark is compute intensive with diverging threads + +template +struct fib_t +{ + __device__ OutputT operator()(IndexT n) + { + OutputT t1 = 0; + OutputT t2 = 1; + + if (n < 1) + { + return t1; + } + if (n == 1) + { + return t1; + } + if (n == 2) + { + return t2; + } + for (IndexT i = 3; i <= n; ++i) + { + const auto next = t1 + t2; + t1 = t2; + t2 = next; + } + return t2; + } +}; +template +static void fibonacci(nvbench::state& state, nvbench::type_list) +{ + using index_t = int64_t; + using output_t = uint32_t; + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42}); + thrust::device_vector out(n); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t{}); +} + +NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types)) + .set_name("fibonacci") + .set_type_axes_names({"OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu new file mode 100644 index 00000000000..7c35b069e24 --- /dev/null +++ b/cub/benchmarks/bench/transform/heavy.cu @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "common.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +// This benchmark uses a LOT of registers and is compute intensive. 
+ +template +struct heavy_functor +{ + // we need to use an unsigned type so overflow in arithmetic wraps around + __device__ std::uint32_t operator()(std::uint32_t data) const + { + std::uint32_t reg[N]; + reg[0] = data; + for (int i = 1; i < N; ++i) + { + reg[i] = reg[i - 1] * reg[i - 1] + 1; + } + for (int i = 0; i < N; ++i) + { + reg[i] = (reg[i] * reg[i]) % 19; + } + for (int i = 0; i < N; ++i) + { + reg[i] = reg[N - i - 1] * reg[i]; + } + std::uint32_t x = 0; + for (int i = 0; i < N; ++i) + { + x += reg[i]; + } + return x; + } +}; + +template +static void heavy(nvbench::state& state, nvbench::type_list) +{ + using value_t = std::uint32_t; + using offset_t = int; + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector in = generate(n); + thrust::device_vector out(n); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor{}); +} + +template +using ic = ::cuda::std::integral_constant; + +NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list, ic<64>, ic<128>, ic<256>>)) + .set_name("heavy") + .set_type_axes_names({"Heaviness{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index 88b189cf964..9c16bee3033 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -32,6 +32,19 @@ NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex"); NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type"); NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type"); +template +struct nvbench::type_strings<::cuda::std::integral_constant> +{ + static std::string input_string() + { + return std::to_string(I); + } + static std::string description() + { + return "integral_constant<" + type_strings::description() + ", " + std::to_string(I) + ">"; + } +}; + namespace detail { From 159c1c3ed255e02e72fef860792db9cca3e4dbe1 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 09:11:43 +0100 Subject: [PATCH 29/45] Start reworking our math functions (#2749) * Move cmath helpers to `__cmath` subfolder * Drop unused functions * Move `lerp` to its own file * Properly qualify function calls in cmath * Move definition of logarithms into their own file and implement them on our own * Move definition of fp min max to its own file * Move definition of floating point trait functions to their own file * Improve tests to ensure we are not constant folding everything * Also port `fpclassify` to enable proper `isnormal` implementation Co-authored-by: Bernhard Manfred Gruber --- libcudacxx/include/cuda/std/__cccl/builtin.h | 169 +++++- libcudacxx/include/cuda/std/__cmath/common.h | 40 ++ .../include/cuda/std/__cmath/fpclassify.h | 189 +++++++ libcudacxx/include/cuda/std/__cmath/lerp.h | 102 ++++ .../include/cuda/std/__cmath/logarithms.h | 494 ++++++++++++++++++ libcudacxx/include/cuda/std/__cmath/min_max.h | 227 ++++++++ .../cmath_nvbf16.h => __cmath/nvbf16.h} | 57 +- .../cmath_nvfp16.h => __cmath/nvfp16.h} | 79 +-- libcudacxx/include/cuda/std/__cmath/traits.h | 470 +++++++++++++++++ .../include/cuda/std/__complex/nvbf16.h | 2 +- .../include/cuda/std/__complex/nvfp16.h | 2 +- .../include/cuda/std/__type_traits/promote.h | 55 +- 
.../cuda/std/detail/libcxx/include/cmath | 243 ++------- .../cuda/std/detail/libcxx/include/complex | 201 ++++--- .../std/numerics/c.math/fp_min_max.pass.cpp | 118 +++++ .../std/numerics/c.math/fp_traits.pass.cpp | 458 ++++++++++++++++ .../std/numerics/c.math/lerp.pass.cpp | 86 +++ .../std/numerics/c.math/logarithms.pass.cpp | 109 ++++ 18 files changed, 2643 insertions(+), 458 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__cmath/common.h create mode 100644 libcudacxx/include/cuda/std/__cmath/fpclassify.h create mode 100644 libcudacxx/include/cuda/std/__cmath/lerp.h create mode 100644 libcudacxx/include/cuda/std/__cmath/logarithms.h create mode 100644 libcudacxx/include/cuda/std/__cmath/min_max.h rename libcudacxx/include/cuda/std/{__cuda/cmath_nvbf16.h => __cmath/nvbf16.h} (68%) rename libcudacxx/include/cuda/std/{__cuda/cmath_nvfp16.h => __cmath/nvfp16.h} (70%) create mode 100644 libcudacxx/include/cuda/std/__cmath/traits.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index b3a53918054..4e0bfae8a9e 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -146,6 +146,32 @@ # define _CCCL_BUILTIN_EXPECT(...) __builtin_expect(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_expect) +#if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FMAXF(...) __builtin_fmaxf(__VA_ARGS__) +# define _CCCL_BUILTIN_FMAX(...) __builtin_fmax(__VA_ARGS__) +# define _CCCL_BUILTIN_FMAXL(...) __builtin_fmaxl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_fmax) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FMAXF +# undef _CCCL_BUILTIN_FMAX +# undef _CCCL_BUILTIN_FMAXL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_fmin) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FMINF(...) __builtin_fminf(__VA_ARGS__) +# define _CCCL_BUILTIN_FMIN(...) __builtin_fmin(__VA_ARGS__) +# define _CCCL_BUILTIN_FMINL(...) __builtin_fminl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_fmin) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FMINF +# undef _CCCL_BUILTIN_FMIN +# undef _CCCL_BUILTIN_FMINL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__builtin_FILE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FILE() __builtin_FILE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FILE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FILE) vvv @@ -158,6 +184,15 @@ # define _CCCL_BUILTIN_FILE() __FILE__ #endif // _CCCL_CUDACC_BELOW(11, 3) +#if _CCCL_CHECK_BUILTIN(builtin_fpclassify) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FPCLASSIFY(...) 
__builtin_fpclassify(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_fpclassify) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FPCLASSIFY +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FUNCTION() __builtin_FUNCTION() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FUNCTION) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FUNCTION) vvv @@ -180,7 +215,34 @@ # undef _CCCL_BUILTIN_IS_CONSTANT_EVALUATED #endif // _CCCL_STD_VER < 2014 && _CCCL_CUDA_COMPILER_NVCC -#if _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) +#if _CCCL_CHECK_BUILTIN(builtin_isfinite) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(NVRTC) +# define _CCCL_BUILTIN_ISFINITE(...) __builtin_isfinite(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(isfinite) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ISFINITE +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_isinf) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ISINF(...) __builtin_isinf(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(isinf) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ISINF +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_isnan) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ISNAN(...) __builtin_isnan(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(isnan) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ISNAN +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if (_CCCL_CHECK_BUILTIN(builtin_launder) || (_CCCL_COMPILER(GCC) && _CCCL_GCC_VERSION >= 70000)) # define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_launder) && gcc >= 7 @@ -202,12 +264,105 @@ # define _CCCL_BUILTIN_LINE() __LINE__ #endif // _CCCL_CUDACC_BELOW(11, 3) +#if _CCCL_CHECK_BUILTIN(builtin_log) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOGF(...) __builtin_logf(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG(...) __builtin_log(__VA_ARGS__) +# define _CCCL_BUILTIN_LOGL(...) __builtin_logl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "logf" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOGF +# undef _CCCL_BUILTIN_LOG +# undef _CCCL_BUILTIN_LOGL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + +#if _CCCL_CHECK_BUILTIN(builtin_log10) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOG10F(...) __builtin_log10f(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG10(...) __builtin_log10(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG10L(...) __builtin_log10l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log10) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "log10f" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOG10F +# undef _CCCL_BUILTIN_LOG10 +# undef _CCCL_BUILTIN_LOG10L +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_ilogb) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ILOGBF(...) __builtin_ilogbf(__VA_ARGS__) +# define _CCCL_BUILTIN_ILOGB(...) 
__builtin_ilogb(__VA_ARGS__) +# define _CCCL_BUILTIN_ILOGBL(...) __builtin_ilogbl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log10) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "ilogb" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_ILOGBF +# undef _CCCL_BUILTIN_ILOGB +# undef _CCCL_BUILTIN_ILOGBL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + +#if _CCCL_CHECK_BUILTIN(builtin_log1p) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOG1PF(...) __builtin_log1pf(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG1P(...) __builtin_log1p(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG1PL(...) __builtin_log1pl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log1p) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "log1p" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOG1PF +# undef _CCCL_BUILTIN_LOG1P +# undef _CCCL_BUILTIN_LOG1PL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + +#if _CCCL_CHECK_BUILTIN(builtin_log2) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOG2F(...) __builtin_log2f(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG2(...) __builtin_log2(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG2L(...) __builtin_log2l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log1) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "log2f" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOG2F +# undef _CCCL_BUILTIN_LOG2 +# undef _CCCL_BUILTIN_LOG2L +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_logb) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOGBF(...) __builtin_logbf(__VA_ARGS__) +# define _CCCL_BUILTIN_LOGB(...) __builtin_logb(__VA_ARGS__) +# define _CCCL_BUILTIN_LOGBL(...) __builtin_logbl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log1) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "logb" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOGBF +# undef _CCCL_BUILTIN_LOGB +# undef _CCCL_BUILTIN_LOGBL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + #if _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) \ && defined(_CCCL_CUDA_COMPILER_CLANG) # define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__) # define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) +#if _CCCL_CHECK_BUILTIN(builtin_signbit) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SIGNBIT(...) __builtin_signbit(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_signbit) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_SIGNBIT +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__decay) && defined(_CCCL_CUDA_COMPILER_CLANG) # define _CCCL_BUILTIN_DECAY(...) __decay(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__decay) && clang-cuda @@ -471,18 +626,6 @@ # define _CCCL_BUILTIN_IS_VOLATILE(...) 
__is_volatile(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_volatile) -#if _CCCL_CHECK_BUILTIN(isfinite) -# define _CCCL_BUILTIN_ISFINITE(...) __builtin_isfinite(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(isfinite) - -#if _CCCL_CHECK_BUILTIN(isinf) -# define _CCCL_BUILTIN_ISINF(...) __builtin_isinf(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(isinf) - -#if _CCCL_CHECK_BUILTIN(isnan) -# define _CCCL_BUILTIN_ISNAN(...) __builtin_isnan(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(isnan) - #if _CCCL_CHECK_BUILTIN(make_integer_seq) || _CCCL_COMPILER(MSVC, >=, 19, 23) # define _CCCL_BUILTIN_MAKE_INTEGER_SEQ(...) __make_integer_seq<__VA_ARGS__> #endif // _CCCL_CHECK_BUILTIN(make_integer_seq) diff --git a/libcudacxx/include/cuda/std/__cmath/common.h b/libcudacxx/include/cuda/std/__cmath/common.h new file mode 100644 index 00000000000..0f6f444d957 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/common.h @@ -0,0 +1,40 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_COMMON_H +#define _LIBCUDACXX___CMATH_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// MSVC and clang cuda need the host side functions included +#if _CCCL_COMPILER(MSVC) || defined(_CCCL_CUDA_COMPILER_CLANG) +# include +#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER_CLANG + +#if defined(_LIBCUDACXX_HAS_NVFP16) +# include +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _LIBCUDACXX_HAS_NVBF16 + +#endif // _LIBCUDACXX___CMATH_COMMON_H diff --git a/libcudacxx/include/cuda/std/__cmath/fpclassify.h b/libcudacxx/include/cuda/std/__cmath/fpclassify.h new file mode 100644 index 00000000000..c55e88cb792 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/fpclassify.h @@ -0,0 +1,189 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_FPCLASSIFY_H +#define _LIBCUDACXX___CMATH_FPCLASSIFY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#if _CCCL_COMPILER(NVRTC) +# ifndef FP_NAN +# define FP_NAN 0 +# endif // ! FP_NAN +# ifndef FP_INFINITE +# define FP_INFINITE 1 +# endif // ! 
FP_INFINITE +# ifndef FP_ZERO +# define FP_ZERO 2 +# endif // ! FP_ZERO +# ifndef FP_SUBNORMAL +# define FP_SUBNORMAL 3 +# endif // ! FP_SUBNORMAL +# ifndef FP_NORMAL +# define FP_NORMAL 4 +# endif // ! FP_NORMAL +#else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ ^/ vvv !_CCCL_COMPILER(NVRTC) vvv +# include +#endif // !_CCCL_COMPILER(NVRTC) + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +struct _CCCL_FLOAT_BITS +{ +#if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned int man : 23; + unsigned int exp : 8; + unsigned int sign : 1; +#else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned int sign : 1; + unsigned int exp : 8; + unsigned int man : 23; +#endif // _LIBCUDACXX_BIG_ENDIAN +}; + +struct _CCCL_DOUBLE_BITS +{ +#if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned int manl : 32; + unsigned int manh : 20; + unsigned int exp : 11; + unsigned int sign : 1; +#else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned int sign : 1; + unsigned int exp : 11; + unsigned int manh : 20; + unsigned int manl : 32; +#endif // _LIBCUDACXX_BIG_ENDIAN +}; + +#if defined(_LIBCUDACXX_HAS_NVFP16) +struct _CCCL_HALF_BITS +{ +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned short man : 10; + unsigned short exp : 5; + unsigned short sign : 1; +# else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned short sign : 1; + unsigned short exp : 5; + unsigned short man : 10; +# endif // _LIBCUDACXX_BIG_ENDIAN +}; +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +struct _CCCL_NVBFLOAT_BITS +{ +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned short man : 7; + unsigned short exp : 8; + unsigned short sign : 1; +# else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned short sign : 1; + unsigned short exp : 8; + unsigned short man : 7; +# endif // _LIBCUDACXX_BIG_ENDIAN +}; +#endif // _LIBCUDACXX_HAS_NVBF16 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(float __x) noexcept +{ + _CCCL_FLOAT_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_FLOAT_BITS>(__x); + if (__bits.exp == 0) + { + return __bits.man == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 255) + { + return __bits.man == 0 ? FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(double __x) noexcept +{ + _CCCL_DOUBLE_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_DOUBLE_BITS>(__x); + if (__bits.exp == 0) + { + return (__bits.manl | __bits.manh) == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 2047) + { + return (__bits.manl | __bits.manh) == 0 ? FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_FPCLASSIFY) + return _CCCL_BUILTIN_FPCLASSIFY(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); +# else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::fpclassify(__x); +# endif // !_CCCL_BUILTIN_SIGNBIT +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(__half __x) noexcept +{ + _CCCL_HALF_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_HALF_BITS>(__x); + if (__bits.exp == 0) + { + return __bits.man == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 31) + { + return __bits.man == 0 ? 
FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(__nv_bfloat16 __x) noexcept +{ + _CCCL_NVBFLOAT_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_NVBFLOAT_BITS>(__x); + if (__bits.exp == 0) + { + return __bits.man == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 255) + { + return __bits.man == 0 ? FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(_A1 __x) noexcept +{ + return (__x == 0) ? FP_ZERO : FP_NORMAL; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_FPCLASSIFY_H diff --git a/libcudacxx/include/cuda/std/__cmath/lerp.h b/libcudacxx/include/cuda/std/__cmath/lerp.h new file mode 100644 index 00000000000..1665a82bb2f --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/lerp.h @@ -0,0 +1,102 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_LERP_H +#define _LIBCUDACXX___CMATH_LERP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Fp __lerp(_Fp __a, _Fp __b, _Fp __t) noexcept +{ + if ((__a <= 0 && __b >= 0) || (__a >= 0 && __b <= 0)) + { + return __t * __b + (1 - __t) * __a; + } + + if (__t == 1) + { + return __b; + } + const _Fp __x = __a + __t * (__b - __a); + if ((__t > 1) == (__b > __a)) + { + return __b < __x ? __x : __b; + } + else + { + return __x < __b ? 
__x : __b; + } +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 float lerp(float __a, float __b, float __t) noexcept +{ + return _CUDA_VSTD::__lerp(__a, __b, __t); +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double lerp(double __a, double __b, double __t) noexcept +{ + return _CUDA_VSTD::__lerp(__a, __b, __t); +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 long double +lerp(long double __a, long double __b, long double __t) noexcept +{ + return _CUDA_VSTD::__lerp(__a, __b, __t); +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half lerp(__half __a, __half __b, __half __t) noexcept +{ + return __float2half(_CUDA_VSTD::__lerp(__half2float(__a), __half2float(__b), __half2float(__t))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 +lerp(__nv_bfloat16 __a, __nv_bfloat16 __b, __nv_bfloat16 __t) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::__lerp(__bfloat162float(__a), __bfloat162float(__b), __bfloat162float(__t))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 +enable_if_t<_CCCL_TRAIT(is_arithmetic, _A1) && _CCCL_TRAIT(is_arithmetic, _A2) && _CCCL_TRAIT(is_arithmetic, _A3), + __promote_t<_A1, _A2, _A3>> +lerp(_A1 __a, _A2 __b, _A3 __t) noexcept +{ + using __result_type = __promote_t<_A1, _A2, _A3>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type) + && _CCCL_TRAIT(is_same, _A3, __result_type)), + ""); + return _CUDA_VSTD::__lerp((__result_type) __a, (__result_type) __b, (__result_type) __t); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_LERP_H diff --git a/libcudacxx/include/cuda/std/__cmath/logarithms.h b/libcudacxx/include/cuda/std/__cmath/logarithms.h new file mode 100644 index 00000000000..660b674f99b --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/logarithms.h @@ -0,0 +1,494 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_LOGARITHMS_H +#define _LIBCUDACXX___CMATH_LOGARITHMS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// log + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGF) + return _CCCL_BUILTIN_LOGF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGF ^^^ / vvv !_CCCL_BUILTIN_LOGF vvv + return ::logf(__x); +#endif // !_CCCL_BUILTIN_LOGF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float logf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGF) + return _CCCL_BUILTIN_LOGF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGF ^^^ / vvv !_CCCL_BUILTIN_LOGF vvv + return ::logf(__x); +#endif // !_CCCL_BUILTIN_LOGF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG) + return _CCCL_BUILTIN_LOG(__x); +#else // ^^^ _CCCL_BUILTIN_LOG ^^^ / vvv !_CCCL_BUILTIN_LOG vvv + return ::log(__x); +#endif // !_CCCL_BUILTIN_LOG +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGL) + return _CCCL_BUILTIN_LOGL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGL ^^^ / vvv !_CCCL_BUILTIN_LOGL vvv + return ::logl(__x); +# endif // !_CCCL_BUILTIN_LOGL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double logl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGL) + return _CCCL_BUILTIN_LOGL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGL ^^^ / vvv !_CCCL_BUILTIN_LOGL vvv + return ::logl(__x); +# endif // !_CCCL_BUILTIN_LOGL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, (return ::hlog(__x);), ({ + float __vf = __half2float(__x); + __vf = _CUDA_VSTD::logf(__vf); + __half_raw __ret_repr = ::__float2half_rn(__vf); + + _CUDA_VSTD::uint16_t __repr = __half_raw(__x).x; + switch (__repr) + { + case 7544: + __ret_repr.x -= 1; + break; + + default:; + } + + return __ret_repr; + })) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hlog(__x);), (return __float2bfloat16(_CUDA_VSTD::logf(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG) + return _CCCL_BUILTIN_LOG((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG ^^^ / vvv !_CCCL_BUILTIN_LOG vvv + return ::log((double) __x); +#endif // !_CCCL_BUILTIN_LOG +} + +// log10 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log10(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10F) + return _CCCL_BUILTIN_LOG10F(__x); +#else // ^^^ _CCCL_BUILTIN_LOG10F ^^^ / vvv !_CCCL_BUILTIN_LOG10F vvv + return ::log10f(__x); +#endif // !_CCCL_BUILTIN_LOG10F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log10f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10F) + return _CCCL_BUILTIN_LOG10F(__x); +#else // 
^^^ _CCCL_BUILTIN_LOG10F ^^^ / vvv !_CCCL_BUILTIN_LOG10F vvv + return ::log10f(__x); +#endif // !_CCCL_BUILTIN_LOG10F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log10(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10) + return _CCCL_BUILTIN_LOG10(__x); +#else // ^^^ _CCCL_BUILTIN_LOG10 ^^^ / vvv !_CCCL_BUILTIN_LOG10 vvv + return ::log10(__x); +#endif // !_CCCL_BUILTIN_LOG10 +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log10(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG10L) + return _CCCL_BUILTIN_LOG10L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG10L ^^^ / vvv !_CCCL_BUILTIN_LOG10L vvv + return ::log10l(__x); +# endif // !_CCCL_BUILTIN_LOG10L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log10l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG10L) + return _CCCL_BUILTIN_LOG10L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG10L ^^^ / vvv !_CCCL_BUILTIN_LOG10L vvv + return ::log10l(__x); +# endif // !_CCCL_BUILTIN_LOG10L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log10(__half __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_53, (return ::hlog10(__x);), (return __float2half(_CUDA_VSTD::log10f(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log10(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hlog10(__x);), (return __float2bfloat16(_CUDA_VSTD::log10f(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log10(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10) + return _CCCL_BUILTIN_LOG10((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG10 ^^^ / vvv !_CCCL_BUILTIN_LOG10 vvv + return ::log10((double) __x); +#endif // !_CCCL_BUILTIN_LOG10 +} + +// ilogb + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGBF) + return _CCCL_BUILTIN_ILOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_ILOGBF ^^^ / vvv !_CCCL_BUILTIN_ILOGBF vvv + return ::ilogbf(__x); +#endif // !_CCCL_BUILTIN_ILOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogbf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGBF) + return _CCCL_BUILTIN_ILOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_ILOGBF ^^^ / vvv !_CCCL_BUILTIN_ILOGBF vvv + return ::ilogbf(__x); +#endif // !_CCCL_BUILTIN_ILOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGB) + return _CCCL_BUILTIN_ILOGB(__x); +#else // ^^^ _CCCL_BUILTIN_ILOGB ^^^ / vvv !_CCCL_BUILTIN_ILOGB vvv + return ::ilogb(__x); +#endif // !_CCCL_BUILTIN_ILOGB +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ILOGBL) + return _CCCL_BUILTIN_ILOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_ILOGBL ^^^ / vvv !_CCCL_BUILTIN_ILOGBL vvv + return ::ilogbl(__x); +# endif // !_CCCL_BUILTIN_ILOGBL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogbl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ILOGBL) + return _CCCL_BUILTIN_ILOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_ILOGBL ^^^ / vvv !_CCCL_BUILTIN_ILOGBL vvv + return ::ilogbl(__x); +# endif // !_CCCL_BUILTIN_ILOGBL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) 
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(__half __x) noexcept +{ + return _CUDA_VSTD::ilogbf(__half2float(__x)); +} +#endif // defined(_LIBCUDACXX_HAS_NVFP16) + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::ilogbf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGB) + return _CCCL_BUILTIN_ILOGB((double) __x); +#else // ^^^ _CCCL_BUILTIN_ILOGB ^^^ / vvv !_CCCL_BUILTIN_ILOGB vvv + return ::ilogb((double) __x); +#endif // !_CCCL_BUILTIN_ILOGB +} + +// log1p + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log1p(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1PF) + return _CCCL_BUILTIN_LOG1PF(__x); +#else // ^^^ _CCCL_BUILTIN_LOG1PF ^^^ / vvv !_CCCL_BUILTIN_LOG1PF vvv + return ::log1pf(__x); +#endif // !_CCCL_BUILTIN_LOG1PF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log1pf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1PF) + return _CCCL_BUILTIN_LOG1PF(__x); +#else // ^^^ _CCCL_BUILTIN_LOG1PF ^^^ / vvv !_CCCL_BUILTIN_LOG1PF vvv + return ::log1pf(__x); +#endif // !_CCCL_BUILTIN_LOG1PF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log1p(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1P) + return _CCCL_BUILTIN_LOG1P(__x); +#else // ^^^ _CCCL_BUILTIN_LOG1P ^^^ / vvv !_CCCL_BUILTIN_LOG1P vvv + return ::log1p(__x); +#endif // !_CCCL_BUILTIN_LOG1P +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log1p(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG1PL) + return _CCCL_BUILTIN_LOG1PL(__x); +# else // ^^^ _CCCL_BUILTIN_LOG1PL ^^^ / vvv !_CCCL_BUILTIN_LOG1PL vvv + return ::log1pl(__x); +# endif // !_CCCL_BUILTIN_LOG1PL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log1pl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG1PL) + return _CCCL_BUILTIN_LOG1PL(__x); +# else // ^^^ _CCCL_BUILTIN_LOG1PL ^^^ / vvv !_CCCL_BUILTIN_LOG1PL vvv + return ::log1pl(__x); +# endif // !_CCCL_BUILTIN_LOG1PL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log1p(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::log1pf(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log1p(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::log1pf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log1p(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1P) + return _CCCL_BUILTIN_LOG1P((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG1P ^^^ / vvv !_CCCL_BUILTIN_LOG1P vvv + return ::log1p((double) __x); +#endif // !_CCCL_BUILTIN_LOG1P +} + +// log2 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log2(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2F) + return _CCCL_BUILTIN_LOG2F(__x); +#else // ^^^ _CCCL_BUILTIN_LOG2F ^^^ / vvv !_CCCL_BUILTIN_LOG2F vvv + return ::log2f(__x); +#endif // !_CCCL_BUILTIN_LOG2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log2f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2F) + return _CCCL_BUILTIN_LOG2F(__x); +#else // ^^^ _CCCL_BUILTIN_LOG2F ^^^ / vvv !_CCCL_BUILTIN_LOG2F vvv + return ::log2f(__x); +#endif // 
!_CCCL_BUILTIN_LOG2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log2(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2) + return _CCCL_BUILTIN_LOG2(__x); +#else // ^^^ _CCCL_BUILTIN_LOG2 ^^^ / vvv !_CCCL_BUILTIN_LOG2 vvv + return ::log2(__x); +#endif // !_CCCL_BUILTIN_LOG2 +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log2(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG2L) + return _CCCL_BUILTIN_LOG2L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG2L ^^^ / vvv !_CCCL_BUILTIN_LOG2L vvv + return ::log2l(__x); +# endif // !_CCCL_BUILTIN_LOG2L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log2l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG2L) + return _CCCL_BUILTIN_LOG2L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG2L ^^^ / vvv !_CCCL_BUILTIN_LOG2L vvv + return ::log2l(__x); +# endif // !_CCCL_BUILTIN_LOG2L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log2(__half __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_53, (return ::hlog2(__x);), (return __float2half(_CUDA_VSTD::log2f(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log2(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hlog2(__x);), (return __float2bfloat16(_CUDA_VSTD::log2f(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log2(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2) + return _CCCL_BUILTIN_LOG2((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG2 ^^^ / vvv !_CCCL_BUILTIN_LOG2 vvv + return ::log2((double) __x); +#endif // !_CCCL_BUILTIN_LOG2 +} + +// logb + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float logb(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGBF) + return _CCCL_BUILTIN_LOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGBF ^^^ / vvv !_CCCL_BUILTIN_LOGBF vvv + return ::logbf(__x); +#endif // !_CCCL_BUILTIN_LOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float logbf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGBF) + return _CCCL_BUILTIN_LOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGBF ^^^ / vvv !_CCCL_BUILTIN_LOGBF vvv + return ::logbf(__x); +#endif // !_CCCL_BUILTIN_LOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double logb(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGB) + return _CCCL_BUILTIN_LOGB(__x); +#else // ^^^ _CCCL_BUILTIN_LOGB ^^^ / vvv !_CCCL_BUILTIN_LOGB vvv + return ::logb(__x); +#endif // !_CCCL_BUILTIN_LOGB +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double logb(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGBL) + return _CCCL_BUILTIN_LOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGBL ^^^ / vvv !_CCCL_BUILTIN_LOGBL vvv + return ::logbl(__x); +# endif // !_CCCL_BUILTIN_LOGBL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double logbl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGBL) + return _CCCL_BUILTIN_LOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGBL ^^^ / vvv !_CCCL_BUILTIN_LOGBL vvv + return ::logbl(__x); +# endif // !_CCCL_BUILTIN_LOGBL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half logb(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::logbf(__half2float(__x))); +} +#endif // 
_LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 logb(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::logbf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double logb(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGB) + return _CCCL_BUILTIN_LOGB((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOGB ^^^ / vvv !_CCCL_BUILTIN_LOGB vvv + return ::logb((double) __x); +#endif // !_CCCL_BUILTIN_LOGB +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_LOGARITHMS_H diff --git a/libcudacxx/include/cuda/std/__cmath/min_max.h b/libcudacxx/include/cuda/std/__cmath/min_max.h new file mode 100644 index 00000000000..009fd499ac8 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/min_max.h @@ -0,0 +1,227 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_MIN_MAX_H +#define _LIBCUDACXX___CMATH_MIN_MAX_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// fmax + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fmax(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxf(__x, __y); +#endif // !_CCCL_BUILTIN_FMAX +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fmaxf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxf(__x, __y); +#endif // !_CCCL_BUILTIN_FMAX +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double fmax(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAX(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmax(__x, __y); +#endif // !_CCCL_BUILTIN_FMAX +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fmax(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxl(__x, __y); +# endif // !_CCCL_BUILTIN_FMAX +} +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fmaxl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxl(__x, __y); +# endif // !_CCCL_BUILTIN_FMAX +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half fmax(__half __x, __half __y) noexcept +{ + 
NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmax(__x, __y);), + (return __float2half(_CUDA_VSTD::fmaxf(__half2float(__x), __half2float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmax(__half __x, _A1 __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__half2float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmax(_A1 __x, __half __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__x, __half2float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 fmax(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmax(__x, __y);), + (return __float2bfloat16(_CUDA_VSTD::fmaxf(__bfloat162float(__x), __bfloat162float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmax(__nv_bfloat16 __x, _A1 __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__bfloat162float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmax(_A1 __x, __nv_bfloat16 __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__x, __bfloat162float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> fmax(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::fmax((__result_type) __x, (__result_type) __y); +} + +// fmin + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fmin(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminf(__x, __y); +#endif // !_CCCL_BUILTIN_FMIN +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fminf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminf(__x, __y); +#endif // !_CCCL_BUILTIN_FMIN +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double fmin(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMIN(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fmin(__x, __y); +#endif // !_CCCL_BUILTIN_FMIN +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fmin(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminl(__x, __y); +# endif // !_CCCL_BUILTIN_FMIN +} +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fminl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminl(__x, __y); +# endif // !_CCCL_BUILTIN_FMIN +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half fmin(__half __x, __half __y) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmin(__x, __y);), + (return __float2half(_CUDA_VSTD::fminf(__half2float(__x), __half2float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmin(__half __x, 
_A1 __y) noexcept +{ + return _CUDA_VSTD::fminf(__half2float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmin(_A1 __x, __half __y) noexcept +{ + return _CUDA_VSTD::fminf(__x, __half2float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 fmin(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmin(__x, __y);), + (return __float2bfloat16(_CUDA_VSTD::fminf(__bfloat162float(__x), __bfloat162float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmin(__nv_bfloat16 __x, _A1 __y) noexcept +{ + return _CUDA_VSTD::fminf(__bfloat162float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmin(_A1 __x, __nv_bfloat16 __y) noexcept +{ + return _CUDA_VSTD::fminf(__x, __bfloat162float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> fmin(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::fmin((__result_type) __x, (__result_type) __y); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_MIN_MAX_H diff --git a/libcudacxx/include/cuda/std/__cuda/cmath_nvbf16.h b/libcudacxx/include/cuda/std/__cmath/nvbf16.h similarity index 68% rename from libcudacxx/include/cuda/std/__cuda/cmath_nvbf16.h rename to libcudacxx/include/cuda/std/__cmath/nvbf16.h index 08ad0445e01..8f116968f8b 100644 --- a/libcudacxx/include/cuda/std/__cuda/cmath_nvbf16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvbf16.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCUDACXX___CUDA_CMATH_NVBF16_H -#define _LIBCUDACXX___CUDA_CMATH_NVBF16_H +#ifndef _LIBCUDACXX___CMATH_NVBF16_H +#define _LIBCUDACXX___CMATH_NVBF16_H #include @@ -70,57 +70,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 atan2(__nv_bfloat16 __x, __nv_bfloat16 _ return __float2bfloat16(::atan2f(__bfloat162float(__x), __bfloat162float(__y))); } -_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log(__nv_bfloat16 __x) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hlog(__x);), (return __float2bfloat16(::logf(__bfloat162float(__x)));)) -} - _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 sqrt(__nv_bfloat16 __x) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2bfloat16(::sqrtf(__bfloat162float(__x)));)) } // floating point helper -_LIBCUDACXX_HIDE_FROM_ABI bool signbit(__nv_bfloat16 __v) -{ - return ::signbit(::__bfloat162float(__v)); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isnan(__nv_bfloat16 __x) noexcept -{ - return ::__hisnan(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isnan(__nv_bfloat16 __v) -{ - return __constexpr_isnan(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isinf(__nv_bfloat16 __x) noexcept -{ -# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) - // this is a workaround for nvbug 4362808 - return !::__hisnan(__x) && ::__hisnan(__x - __x); -# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv - return ::__hisinf(__x) != 0; -# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_BELOW(12, 3) -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isinf(__nv_bfloat16 __v) -{ - return __constexpr_isinf(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isfinite(__nv_bfloat16 __x) 
noexcept -{ - return !__constexpr_isnan(__x) && !__constexpr_isinf(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__nv_bfloat16 __v) -{ - return __constexpr_isfinite(__v); -} - _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept { return __float2bfloat16(::copysignf(__bfloat162float(__x), __bfloat162float(__y))); @@ -128,7 +83,7 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) { - return __constexpr_copysign(__x, __y); + return _CUDA_VSTD::__constexpr_copysign(__x, __y); } _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_fabs(__nv_bfloat16 __x) noexcept @@ -138,12 +93,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_fabs(__nv_bfloat16 __x) noex _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 fabs(__nv_bfloat16 __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 abs(__nv_bfloat16 __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_fmax(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept @@ -155,4 +110,4 @@ _LIBCUDACXX_END_NAMESPACE_STD #endif /// _LIBCUDACXX_HAS_NVBF16 -#endif // _LIBCUDACXX___CUDA_CMATH_NVBF16_H +#endif // _LIBCUDACXX___CMATH_NVBF16_H diff --git a/libcudacxx/include/cuda/std/__cuda/cmath_nvfp16.h b/libcudacxx/include/cuda/std/__cmath/nvfp16.h similarity index 70% rename from libcudacxx/include/cuda/std/__cuda/cmath_nvfp16.h rename to libcudacxx/include/cuda/std/__cmath/nvfp16.h index 42f314b36bf..dbcaebbb4ef 100644 --- a/libcudacxx/include/cuda/std/__cuda/cmath_nvfp16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvfp16.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCUDACXX___CUDA_CMATH_NVFP16_H -#define _LIBCUDACXX___CUDA_CMATH_NVFP16_H +#ifndef _LIBCUDACXX___CMATH_NVFP16_H +#define _LIBCUDACXX___CMATH_NVFP16_H #include @@ -135,79 +135,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __half atan2(__half __x, __half __y) return __float2half(::atan2f(__half2float(__x), __half2float(__y))); } -// clang-format off -_LIBCUDACXX_HIDE_FROM_ABI __half log(__half __x) -{ - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, ( - return ::hlog(__x); - ), ( - { - float __vf = __half2float(__x); - __vf = ::logf(__vf); - __half_raw __ret_repr = ::__float2half_rn(__vf); - - uint16_t __repr = __half_raw(__x).x; - switch (__repr) - { - case 7544: - __ret_repr.x -= 1; - break; - - default:; - } - - return __ret_repr; - } - )) -} -// clang-format on - _LIBCUDACXX_HIDE_FROM_ABI __half sqrt(__half __x) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2half(::sqrtf(__half2float(__x)));)) } // floating point helper -_LIBCUDACXX_HIDE_FROM_ABI bool signbit(__half __v) -{ - return ::signbit(::__half2float(__v)); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isnan(__half __x) noexcept -{ - return ::__hisnan(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isnan(__half __v) -{ - return __constexpr_isnan(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isinf(__half __x) noexcept -{ -# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) - // this is a workaround for nvbug 4362808 - return !::__hisnan(__x) && ::__hisnan(__x - __x); -# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv - return ::__hisinf(__x) != 0; -# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_BELOW(12, 3) -} - 
-_LIBCUDACXX_HIDE_FROM_ABI bool isinf(__half __v) -{ - return __constexpr_isinf(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isfinite(__half __x) noexcept -{ - return !__constexpr_isnan(__x) && !__constexpr_isinf(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__half __v) -{ - return __constexpr_isfinite(__v); -} - _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_copysign(__half __x, __half __y) noexcept { return __float2half(::copysignf(__half2float(__x), __half2float(__y))); @@ -215,7 +148,7 @@ _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_copysign(__half __x, __half __y) no _LIBCUDACXX_HIDE_FROM_ABI __half copysign(__half __x, __half __y) { - return __constexpr_copysign(__x, __y); + return _CUDA_VSTD::__constexpr_copysign(__x, __y); } _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_fabs(__half __x) noexcept @@ -225,12 +158,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_fabs(__half __x) noexcept _LIBCUDACXX_HIDE_FROM_ABI __half fabs(__half __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __half abs(__half __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_fmax(__half __x, __half __y) noexcept @@ -242,4 +175,4 @@ _LIBCUDACXX_END_NAMESPACE_STD #endif /// _LIBCUDACXX_HAS_NVFP16 -#endif // _LIBCUDACXX___CUDA_CMATH_NVFP16_H +#endif // _LIBCUDACXX___CMATH_NVFP16_H diff --git a/libcudacxx/include/cuda/std/__cmath/traits.h b/libcudacxx/include/cuda/std/__cmath/traits.h new file mode 100644 index 00000000000..cac18ee341b --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/traits.h @@ -0,0 +1,470 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_TRAITS_H +#define _LIBCUDACXX___CMATH_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SIGNBIT) + return _CCCL_BUILTIN_SIGNBIT(__x); +#else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::signbit(__x); +#endif // !_CCCL_BUILTIN_SIGNBIT +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SIGNBIT) + return _CCCL_BUILTIN_SIGNBIT(__x); +#else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::signbit(__x); +#endif // !_CCCL_BUILTIN_SIGNBIT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_SIGNBIT) + return _CCCL_BUILTIN_SIGNBIT(__x); +# else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::signbit(__x); +# endif // !_CCCL_BUILTIN_SIGNBIT +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(__half __x) noexcept +{ + return _CUDA_VSTD::signbit(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::signbit(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(_A1 __x) noexcept +{ + return __x < 0; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(_A1) noexcept +{ + return false; +} + +// isfinite + +#if defined(_CCCL_BUILTIN_ISFINITE) || (defined(_CCCL_BUILTIN_ISINF) && defined(_CCCL_BUILTIN_ISNAN)) +# define _CCCL_CONSTEXPR_ISFINITE constexpr +#else // ^^^ _CCCL_BUILTIN_ISFINITE ^^^ / vvv !_CCCL_BUILTIN_ISFINITE vvv +# define _CCCL_CONSTEXPR_ISFINITE +#endif // !_CCCL_BUILTIN_ISFINITE + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool isfinite(_A1) noexcept +{ + return true; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISFINITE bool isfinite(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISFINITE) + return _CCCL_BUILTIN_ISFINITE(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return !::__isinf(__x) && !::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isfinite(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISFINITE bool isfinite(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISFINITE) + return _CCCL_BUILTIN_ISFINITE(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return !::__isinf(__x) && !::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isfinite(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISFINITE bool 
isfinite(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ISFINITE) + return _CCCL_BUILTIN_ISFINITE(__x); +# elif _CCCL_CUDACC_BELOW(11, 8) + return !::__isinf(__x) && !::__isnan(__x); +# else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isfinite(__x); +# endif // !_CCCL_CUDACC_BELOW(11, 8) +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__half __x) noexcept +{ + return !::__hisnan(__x) && !::__hisinf(__x); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__nv_bfloat16 __x) noexcept +{ + return !::__hisnan(__x) && !::__hisinf(__x); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isinf + +#if defined(_CCCL_BUILTIN_ISINF) +# define _CCCL_CONSTEXPR_ISINF constexpr +#else // ^^^ _CCCL_BUILTIN_ISINF ^^^ / vvv !_CCCL_BUILTIN_ISINF vvv +# define _CCCL_CONSTEXPR_ISINF +#endif // !_CCCL_BUILTIN_ISINF + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool isinf(_A1) noexcept +{ + return false; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISINF bool isinf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISINF) + return _CCCL_BUILTIN_ISINF(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isinf(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isinf(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISINF bool isinf(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISINF) + return _CCCL_BUILTIN_ISINF(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isinf(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isinf(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISINF bool isinf(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ISINF) + return _CCCL_BUILTIN_ISINF(__x); +# elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isinf(__x); +# else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isinf(__x); +# endif // !_CCCL_CUDACC_BELOW(11, 8) +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isinf(__half __x) noexcept +{ +# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) + // this is a workaround for nvbug 4362808 + return !::__hisnan(__x) && ::__hisnan(__x - __x); +# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv + return ::__hisinf(__x) != 0; +# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_VER < 1203000 +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isinf(__nv_bfloat16 __x) noexcept +{ +# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) + // this is a workaround for nvbug 4362808 + return !::__hisnan(__x) && ::__hisnan(__x - __x); +# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv + return ::__hisinf(__x) != 0; +# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_VER < 1203000 +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isnan + +#if defined(_CCCL_BUILTIN_ISNAN) +# define _CCCL_CONSTEXPR_ISNAN constexpr +#else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv +# define _CCCL_CONSTEXPR_ISNAN +#endif // !_CCCL_BUILTIN_ISNAN + +template = 0> 
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool isnan(_A1) noexcept +{ + return false; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISNAN bool isnan(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISNAN) + return _CCCL_BUILTIN_ISNAN(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isnan(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISNAN bool isnan(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISNAN) + return _CCCL_BUILTIN_ISNAN(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isnan(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISNAN bool isnan(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ISNAN) + return _CCCL_BUILTIN_ISNAN(__x); +# elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isnan(__x); +# else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isnan(__x); +# endif // !_CCCL_CUDACC_BELOW(11, 8) +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnan(__half __x) noexcept +{ + return ::__hisnan(__x); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnan(__nv_bfloat16 __x) noexcept +{ + return ::__hisnan(__x); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isnormal + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(_A1 __x) noexcept +{ + return __x != 0; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(float __x) noexcept +{ + return _CUDA_VSTD::fpclassify(__x) == FP_NORMAL; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(double __x) noexcept +{ + return _CUDA_VSTD::fpclassify(__x) == FP_NORMAL; +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(long double __x) noexcept +{ + return _CUDA_VSTD::fpclassify(__x) == FP_NORMAL; +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(__half __x) noexcept +{ + return _CUDA_VSTD::isnormal(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::isnormal(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isgreater + +template +struct __is_extended_arithmetic +{ + static constexpr bool value = _CCCL_TRAIT(is_arithmetic, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp); +}; + +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) +template +_CCCL_INLINE_VAR constexpr bool __is_extended_arithmetic_v = + is_arithmetic_v<_Tp> || __is_extended_floating_point_v<_Tp>; +#endif // !_CCCL_NO_INLINE_VARIABLES + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_isgreater(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x > __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isgreater(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + 
(return ::isgreater((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_isgreater((type) __x, (type) __y);)) +} + +// isgreaterequal + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_isgreaterequal(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x >= __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isgreaterequal(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::isgreaterequal((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_isgreaterequal((type) __x, (type) __y);)) +} + +// isless + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_isless(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x < __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isless(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::isless((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_isless((type) __x, (type) __y);)) +} + +// islessequal + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_islessequal(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x <= __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool islessequal(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::islessequal((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_islessequal((type) __x, (type) __y);)) +} + +// islessgreater + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_islessgreater(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x < __y || __x > __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool islessgreater(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::islessgreater((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_islessgreater((type) __x, (type) __y);)) +} + +// isunordered + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isunordered(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + return _CUDA_VSTD::isnan((type) __x) || _CUDA_VSTD::isnan((type) __y); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_TRAITS_H diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index 0167f952141..1282b47f6d9 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -28,8 +28,8 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") # include _CCCL_DIAG_POP +# include # include -# include # include # include # include diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index 8ddd2b27747..bc2da05d61d 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -25,8 +25,8 @@ # include +# include # include -# include # include # include # include diff --git a/libcudacxx/include/cuda/std/__type_traits/promote.h b/libcudacxx/include/cuda/std/__type_traits/promote.h index daa545c5fa1..18a5afacfef 100644 --- 
a/libcudacxx/include/cuda/std/__type_traits/promote.h +++ b/libcudacxx/include/cuda/std/__type_traits/promote.h @@ -43,10 +43,10 @@ struct __numeric_type { _LIBCUDACXX_HIDE_FROM_ABI static void __test(...); #ifdef _LIBCUDACXX_HAS_NVFP16 - _LIBCUDACXX_HIDE_FROM_ABI static __half __test(__half); + _LIBCUDACXX_HIDE_FROM_ABI static float __test(__half); #endif // _LIBCUDACXX_HAS_NVBF16 #ifdef _LIBCUDACXX_HAS_NVBF16 - _LIBCUDACXX_HIDE_FROM_ABI static __nv_bfloat16 __test(__nv_bfloat16); + _LIBCUDACXX_HIDE_FROM_ABI static float __test(__nv_bfloat16); #endif // _LIBCUDACXX_HAS_NVFP16 _LIBCUDACXX_HIDE_FROM_ABI static float __test(float); _LIBCUDACXX_HIDE_FROM_ABI static double __test(char); @@ -69,10 +69,55 @@ struct __numeric_type static const bool value = true; }; +template +struct __is_mixed_extended_floating_point +{ + static constexpr bool value = false; +}; + +#if defined(_LIBCUDACXX_HAS_NVFP16) && defined(_LIBCUDACXX_HAS_NVBF16) +template +struct __is_mixed_extended_floating_point<_A1, __half, __nv_bfloat16> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<_A1, __nv_bfloat16, __half> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__half, _A1, __nv_bfloat16> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__nv_bfloat16, _A1, __half> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__half, __nv_bfloat16, _A1> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__nv_bfloat16, __half, _A1> +{ + static constexpr bool value = true; +}; +#endif // _LIBCUDACXX_HAS_NVFP16 && _LIBCUDACXX_HAS_NVBF16 + template ::value && __numeric_type<_A2>::value && __numeric_type<_A3>::value> + bool = __numeric_type<_A1>::value && __numeric_type<_A2>::value && __numeric_type<_A3>::value + && !__is_mixed_extended_floating_point<_A1, _A2, _A3>::value> class __promote_imp { public: @@ -96,8 +141,8 @@ template class __promote_imp<_A1, _A2, void, true> { private: - typedef typename __promote_imp<_A1>::type __type1; - typedef typename __promote_imp<_A2>::type __type2; + using __type1 = typename __promote_imp<_A1>::type; + using __type2 = typename __promote_imp<_A2>::type; public: typedef decltype(__type1() + __type2()) type; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath index 7066ddec4f2..0f5610d97fc 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath @@ -318,16 +318,21 @@ long double truncl(long double x); # include #endif // _CCCL_COMPILER(NVHPC) +#include +#include +#include +#include +#include #include #include #include #ifdef _LIBCUDACXX_HAS_NVFP16 -# include +# include #endif // _LIBCUDACXX_HAS_NVFP16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 #if _CCCL_COMPILER(NVRTC) @@ -340,11 +345,6 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -using ::isfinite; -using ::isinf; -using ::isnan; -using ::signbit; - using ::acos; using ::acosf; using ::asin; @@ -386,9 +386,6 @@ using ::asinhf; using ::atanh; using ::atanhf; -using ::log; -using ::logf; - using ::hypot; using ::hypotf; @@ -398,16 +395,6 @@ using ::abs; #if !_CCCL_COMPILER(NVRTC) -using ::fpclassify; -using ::isgreater; -using ::isgreaterequal; -using ::isless; -using ::islessequal; -using ::islessgreater; 
-using ::isnormal; - -using ::isunordered; - using ::double_t; using ::float_t; @@ -424,11 +411,6 @@ using ::frexpf; using ::ldexp; using ::ldexpf; -using ::log; -using ::logf; - -using ::log10; -using ::log10f; using ::modf; using ::modff; @@ -472,24 +454,12 @@ using ::fdim; using ::fdimf; using ::fma; using ::fmaf; -using ::fmax; -using ::fmaxf; -using ::fmin; -using ::fminf; -using ::ilogb; -using ::ilogbf; using ::lgamma; using ::lgammaf; using ::llrint; using ::llrintf; using ::llround; using ::llroundf; -using ::log1p; -using ::log1pf; -using ::log2; -using ::log2f; -using ::logb; -using ::logbf; using ::lrint; using ::lrintf; using ::lround; @@ -534,8 +504,6 @@ using ::floorl; using ::fmodl; using ::frexpl; using ::ldexpl; -using ::log10l; -using ::logl; using ::modfl; using ::powl; using ::sinhl; @@ -557,16 +525,10 @@ using ::exp2l; using ::expm1l; using ::fdiml; using ::fmal; -using ::fmaxl; -using ::fminl; using ::hypotl; -using ::ilogbl; using ::lgammal; using ::llrintl; using ::llroundl; -using ::log1pl; -using ::log2l; -using ::logbl; using ::lrintl; using ::lroundl; using ::nanl; @@ -587,16 +549,16 @@ using ::truncl; #if _CCCL_STD_VER > 2014 && !defined(__cuda_std__) _LIBCUDACXX_HIDE_FROM_ABI float hypot(float x, float y, float z) { - return sqrt(x * x + y * y + z * z); + return _CUDA_VSTD::sqrt(x * x + y * y + z * z); } _LIBCUDACXX_HIDE_FROM_ABI double hypot(double x, double y, double z) { - return sqrt(x * x + y * y + z * z); + return _CUDA_VSTD::sqrt(x * x + y * y + z * z); } # ifdef _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE _LIBCUDACXX_HIDE_FROM_ABI long double hypot(long double x, long double y, long double z) { - return sqrt(x * x + y * y + z * z); + return _CUDA_VSTD::sqrt(x * x + y * y + z * z); } # endif @@ -610,7 +572,7 @@ hypot(_A1 __lcpp_x, _A2 __lcpp_y, _A3 __lcpp_z) noexcept static_assert( (!(is_same<_A1, __result_type>::value && is_same<_A2, __result_type>::value && is_same<_A3, __result_type>::value)), ""); - return ::hypot((__result_type) __lcpp_x, (__result_type) __lcpp_y, (__result_type) __lcpp_z); + return _CUDA_VSTD::hypot((__result_type) __lcpp_x, (__result_type) __lcpp_y, (__result_type) __lcpp_z); } #endif @@ -620,69 +582,6 @@ hypot(_A1 __lcpp_x, _A2 __lcpp_y, _A3 __lcpp_z) noexcept # define _CCCL_CONSTEXPR_CXX14_COMPLEX #endif // _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isnan(_A1 __lcpp_x) noexcept -{ -#if _CCCL_CUDACC_BELOW(11, 8) - return __isnan(__lcpp_x); -#elif defined(_CCCL_BUILTIN_ISNAN) - // nvcc at times has issues determining the type of __lcpp_x - return _CCCL_BUILTIN_ISNAN(static_cast(__lcpp_x)); -#else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv - return ::isnan(__lcpp_x); -#endif // !_CCCL_BUILTIN_ISNAN -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isnan(_A1 __lcpp_x) noexcept -{ - return ::isnan(__lcpp_x); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isinf(_A1 __lcpp_x) noexcept -{ -#if _CCCL_CUDACC_BELOW(11, 8) - return __isinf(__lcpp_x); -#elif defined(_CCCL_BUILTIN_ISINF) - // nvcc at times has issues determining the type of __lcpp_x - return __builtin_isinf(static_cast(__lcpp_x)); -#else // ^^^ _CCCL_BUILTIN_ISINF ^^^ / vvv !_CCCL_BUILTIN_ISINF vvv - return ::isinf(__lcpp_x); -#endif // !_CCCL_BUILTIN_ISINF -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isinf(_A1 __lcpp_x) noexcept -{ - return 
::isinf(__lcpp_x); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isfinite(_A1 __lcpp_x) noexcept -{ -#if _CCCL_CUDACC_BELOW(11, 8) - return !__isinf(__lcpp_x) && !__isnan(__lcpp_x); -#elif defined(_CCCL_BUILTIN_ISFINITE) - // nvcc at times has issues determining the type of __lcpp_x - return __builtin_isfinite(static_cast(__lcpp_x)); -#else // ^^^ _CCCL_BUILTIN_ISFINITE ^^^ / vvv !_CCCL_BUILTIN_ISFINITE vvv - return ::isfinite(__lcpp_x); -#endif // !_CCCL_BUILTIN_ISFINITE -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isfinite(_A1 __lcpp_x) noexcept -{ - return isfinite(__lcpp_x); -} - #if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_copysign(_A1 __x, _A1 __y) noexcept @@ -700,11 +599,13 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_copysign(doub return __builtin_copysign(__x, __y); } +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 long double __constexpr_copysign(long double __x, long double __y) noexcept { return __builtin_copysignl(__x, __y); } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE template _LIBCUDACXX_HIDE_FROM_ABI @@ -734,10 +635,12 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_fabs(double _ return __builtin_fabs(__x); } +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 long double __constexpr_fabs(long double __x) noexcept { return __builtin_fabsl(__x); } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE template ::value, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_fabs(_Tp __x) noexcept @@ -762,11 +665,11 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX float __constexpr_fmax(f if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) # endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) { - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return __y; } - if (__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { return __x; } @@ -785,11 +688,11 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX double __constexpr_fmax( if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) # endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) { - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return __y; } - if (__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { return __x; } @@ -799,29 +702,31 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX double __constexpr_fmax( return __builtin_fmax(__x, __y); } +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX long double __constexpr_fmax(long double __x, long double __y) noexcept { -# if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && !defined(_LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS) -# if _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) +# if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && !defined(_LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS) +# if _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) if (false) -# else // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) +# else // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) -# endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) +# endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) { - if 
(__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return __y; } - if (__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { return __x; } return __x < __y ? __y : __x; } -# endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) +# endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) return __builtin_fmax(__x, __y); } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE template ::value && is_arithmetic<_Up>::value, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX __promote_t<_Tp, _Up> __constexpr_fmax(_Tp __x, _Up __y) noexcept @@ -835,7 +740,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX __promote_t<_Tp, _Up> __ template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_logb(_A1 __x) { - return ::logb(__x); + return _CUDA_VSTD::logb(__x); } #else template @@ -850,17 +755,17 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_logb(_Tp return -numeric_limits<_Tp>::infinity(); } - if (__constexpr_isinf(__x)) + if (_CUDA_VSTD::isinf(__x)) { return numeric_limits<_Tp>::infinity(); } - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return numeric_limits<_Tp>::quiet_NaN(); } - __x = __constexpr_fabs(__x); + __x = _CUDA_VSTD::__constexpr_fabs(__x); unsigned long long __exp = 0; while (__x >= _Tp(numeric_limits<_Tp>::radix)) { @@ -912,7 +817,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ return __x; } - if (__constexpr_isinf(__x)) + if (_CUDA_VSTD::isinf(__x)) { return __x; } @@ -922,7 +827,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ return __x; } - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return numeric_limits<_Tp>::quiet_NaN(); } @@ -960,86 +865,6 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ } #endif // !_CCCL_COMPILER(MSVC) -#if _CCCL_STD_VER > 2017 -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr _Fp __lerp(_Fp __a, _Fp __b, _Fp __t) noexcept -{ - if ((__a <= 0 && __b >= 0) || (__a >= 0 && __b <= 0)) - { - return __t * __b + (1 - __t) * __a; - } - - if (__t == 1) - { - return __b; - } - const _Fp __x = __a + __t * (__b - __a); - if ((__t > 1) == (__b > __a)) - { - return __b < __x ? __x : __b; - } - else - { - return __x < __b ? 
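// Annotation, not part of the change: the __lerp block being removed here guards
// monotonicity under extrapolation. With x = a + t*(b - a), the test
// (t > 1) == (b > a) is true exactly when the mathematical result is >= b, so the
// code returns max(x, b) in that case and min(x, b) otherwise; rounding can
// therefore never push the result to the wrong side of b. cuda::std::lerp itself
// is still exercised by the new lerp.pass.cpp test added later in this patch.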
__x : __b; - } -} - -_LIBCUDACXX_HIDE_FROM_ABI constexpr float lerp(float __a, float __b, float __t) noexcept -{ - return __lerp(__a, __b, __t); -} - -_LIBCUDACXX_HIDE_FROM_ABI constexpr double lerp(double __a, double __b, double __t) noexcept -{ - return __lerp(__a, __b, __t); -} - -_LIBCUDACXX_HIDE_FROM_ABI constexpr long double lerp(long double __a, long double __b, long double __t) noexcept -{ - return __lerp(__a, __b, __t); -} - -#endif // _CCCL_STD_VER > 2017 - -template ::digits > numeric_limits<_IntT>::digits), - int _Bits = (numeric_limits<_IntT>::digits - numeric_limits<_FloatT>::digits)> -_LIBCUDACXX_HIDE_FROM_ABI constexpr _IntT __max_representable_int_for_float() noexcept -{ - static_assert(is_floating_point<_FloatT>::value, "must be a floating point type"); - static_assert(is_integral<_IntT>::value, "must be an integral type"); - static_assert(numeric_limits<_FloatT>::radix == 2, "FloatT has incorrect radix"); -#ifdef _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE - static_assert( - (_IsSame<_FloatT, float>::value || _IsSame<_FloatT, double>::value || _IsSame<_FloatT, long double>::value), - "unsupported floating point type"); -#else - static_assert((_IsSame<_FloatT, float>::value || _IsSame<_FloatT, double>::value), "unsupported floating point type"); -#endif - return _FloatBigger ? numeric_limits<_IntT>::max() : (numeric_limits<_IntT>::max() >> _Bits << _Bits); -} - -// Convert a floating point number to the specified integral type after -// clamping to the integral types representable range. -// -// The behavior is undefined if `__r` is NaN. -template -_LIBCUDACXX_HIDE_FROM_ABI _IntT __clamp_to_integral(_RealT __r) noexcept -{ - using _Lim = _CUDA_VSTD::numeric_limits<_IntT>; - const _IntT _MaxVal = _CUDA_VSTD::__max_representable_int_for_float<_IntT, _RealT>(); - if (__r >= ::nextafter(static_cast<_RealT>(_MaxVal), INFINITY)) - { - return _Lim::max(); - } - else if (__r <= _Lim::lowest()) - { - return _Lim::min(); - } - return static_cast<_IntT>(__r); -} - _LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index 7eecbcc4a20..22a88aa93db 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -513,16 +513,14 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); - bool __z_inf = _CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b); - bool __w_inf = _CUDA_VSTD::__constexpr_isinf(__c) || _CUDA_VSTD::__constexpr_isinf(__d); + bool __z_inf = _CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b); + bool __w_inf = _CUDA_VSTD::isinf(__c) || _CUDA_VSTD::isinf(__d); bool __z_nan = !__z_inf - && ((_CUDA_VSTD::__constexpr_isnan(__a) && _CUDA_VSTD::__constexpr_isnan(__b)) - || (_CUDA_VSTD::__constexpr_isnan(__a) && __b == _Tp(0)) - || (__a == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__b))); + && ((_CUDA_VSTD::isnan(__a) && _CUDA_VSTD::isnan(__b)) || (_CUDA_VSTD::isnan(__a) && __b == _Tp(0)) + || (__a == _Tp(0) && _CUDA_VSTD::isnan(__b))); bool __w_nan = !__w_inf - && ((_CUDA_VSTD::__constexpr_isnan(__c) && _CUDA_VSTD::__constexpr_isnan(__d)) - || (_CUDA_VSTD::__constexpr_isnan(__c) && __d == _Tp(0)) - || (__c == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__d))); + && ((_CUDA_VSTD::isnan(__c) && _CUDA_VSTD::isnan(__d)) || (_CUDA_VSTD::isnan(__c) && __d == _Tp(0)) + || (__c == 
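// Annotation, not part of the change: an operand of the complex product is
// treated as NaN-like only when it is not infinite and either both parts are NaN
// or one part is NaN while the other part is exactly zero. Restated as a
// standalone predicate (name is hypothetical, assumes <cuda/std/cmath>):
bool nan_like_sketch(double re, double im)
{
  const bool is_inf = cuda::std::isinf(re) || cuda::std::isinf(im);
  return !is_inf
      && ((cuda::std::isnan(re) && cuda::std::isnan(im))
          || (cuda::std::isnan(re) && im == 0.0)
          || (re == 0.0 && cuda::std::isnan(im)));
}
// Such operands force the whole product to NaN; fully infinite operands are
// handled separately a few lines further down.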
_Tp(0) && _CUDA_VSTD::isnan(__d))); if (__z_nan || __w_nan) { return complex<_Tp>(_Tp(numeric_limits<_Tp>::quiet_NaN()), _Tp(0)); @@ -535,10 +533,8 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) } return complex<_Tp>(_Tp(numeric_limits<_Tp>::infinity()), _Tp(numeric_limits<_Tp>::infinity())); } - bool __z_nonzero_nan = - !__z_inf && !__z_nan && (_CUDA_VSTD::__constexpr_isnan(__a) || _CUDA_VSTD::__constexpr_isnan(__b)); - bool __w_nonzero_nan = - !__w_inf && !__w_nan && (_CUDA_VSTD::__constexpr_isnan(__c) || _CUDA_VSTD::__constexpr_isnan(__d)); + bool __z_nonzero_nan = !__z_inf && !__z_nan && (_CUDA_VSTD::isnan(__a) || _CUDA_VSTD::isnan(__b)); + bool __w_nonzero_nan = !__w_inf && !__w_nan && (_CUDA_VSTD::isnan(__c) || _CUDA_VSTD::isnan(__d)); if (__z_nonzero_nan || __w_nonzero_nan) { return complex<_Tp>(_Tp(numeric_limits<_Tp>::quiet_NaN()), _Tp(0)); @@ -551,54 +547,54 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) _Tp __x = __partials.__ac - __partials.__bd; _Tp __y = __partials.__ad + __partials.__bc; #ifndef LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_MULTIPLICATION - if (_CUDA_VSTD::__constexpr_isnan(__x) && _CUDA_VSTD::__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__x) && _CUDA_VSTD::isnan(__y)) { bool __recalc = false; - if (_CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b)) + if (_CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b)) { - __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__a) ? _Tp(1) : _Tp(0), __a); - __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__b) ? _Tp(1) : _Tp(0), __b); - if (_CUDA_VSTD::__constexpr_isnan(__c)) + __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__b) ? _Tp(1) : _Tp(0), __b); + if (_CUDA_VSTD::isnan(__c)) { __c = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __c); } - if (_CUDA_VSTD::__constexpr_isnan(__d)) + if (_CUDA_VSTD::isnan(__d)) { __d = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __d); } __recalc = true; } - if (_CUDA_VSTD::__constexpr_isinf(__c) || _CUDA_VSTD::__constexpr_isinf(__d)) + if (_CUDA_VSTD::isinf(__c) || _CUDA_VSTD::isinf(__d)) { - __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__c) ? _Tp(1) : _Tp(0), __c); - __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__d) ? _Tp(1) : _Tp(0), __d); - if (_CUDA_VSTD::__constexpr_isnan(__a)) + __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__d) ? 
_Tp(1) : _Tp(0), __d); + if (_CUDA_VSTD::isnan(__a)) { __a = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __a); } - if (_CUDA_VSTD::__constexpr_isnan(__b)) + if (_CUDA_VSTD::isnan(__b)) { __b = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __b); } __recalc = true; } if (!__recalc - && (_CUDA_VSTD::__constexpr_isinf(__partials.__ac) || _CUDA_VSTD::__constexpr_isinf(__partials.__bd) - || _CUDA_VSTD::__constexpr_isinf(__partials.__ad) || _CUDA_VSTD::__constexpr_isinf(__partials.__bc))) + && (_CUDA_VSTD::isinf(__partials.__ac) || _CUDA_VSTD::isinf(__partials.__bd) + || _CUDA_VSTD::isinf(__partials.__ad) || _CUDA_VSTD::isinf(__partials.__bc))) { - if (_CUDA_VSTD::__constexpr_isnan(__a)) + if (_CUDA_VSTD::isnan(__a)) { __a = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __a); } - if (_CUDA_VSTD::__constexpr_isnan(__b)) + if (_CUDA_VSTD::isnan(__b)) { __b = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __b); } - if (_CUDA_VSTD::__constexpr_isnan(__c)) + if (_CUDA_VSTD::isnan(__c)) { __c = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __c); } - if (_CUDA_VSTD::__constexpr_isnan(__d)) + if (_CUDA_VSTD::isnan(__d)) { __d = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __d); } @@ -643,7 +639,7 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) _Tp __d = __w.imag(); _Tp __logbw = _CUDA_VSTD::__constexpr_logb( _CUDA_VSTD::__constexpr_fmax(_CUDA_VSTD::__constexpr_fabs(__c), _CUDA_VSTD::__constexpr_fabs(__d))); - if (_CUDA_VSTD::__constexpr_isfinite(__logbw)) + if (_CUDA_VSTD::isfinite(__logbw)) { __ilogbw = static_cast(__logbw); __c = _CUDA_VSTD::__constexpr_scalbn(__c, -__ilogbw); @@ -656,24 +652,20 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); - bool __z_inf = _CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b); - bool __w_inf = _CUDA_VSTD::__constexpr_isinf(__c) || _CUDA_VSTD::__constexpr_isinf(__d); + bool __z_inf = _CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b); + bool __w_inf = _CUDA_VSTD::isinf(__c) || _CUDA_VSTD::isinf(__d); bool __z_nan = !__z_inf - && ((_CUDA_VSTD::__constexpr_isnan(__a) && _CUDA_VSTD::__constexpr_isnan(__b)) - || (_CUDA_VSTD::__constexpr_isnan(__a) && __b == _Tp(0)) - || (__a == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__b))); + && ((_CUDA_VSTD::isnan(__a) && _CUDA_VSTD::isnan(__b)) || (_CUDA_VSTD::isnan(__a) && __b == _Tp(0)) + || (__a == _Tp(0) && _CUDA_VSTD::isnan(__b))); bool __w_nan = !__w_inf - && ((_CUDA_VSTD::__constexpr_isnan(__c) && _CUDA_VSTD::__constexpr_isnan(__d)) - || (_CUDA_VSTD::__constexpr_isnan(__c) && __d == _Tp(0)) - || (__c == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__d))); + && ((_CUDA_VSTD::isnan(__c) && _CUDA_VSTD::isnan(__d)) || (_CUDA_VSTD::isnan(__c) && __d == _Tp(0)) + || (__c == _Tp(0) && _CUDA_VSTD::isnan(__d))); if ((__z_nan || __w_nan) || (__z_inf && __w_inf)) { return complex<_Tp>(_Tp(numeric_limits<_Tp>::quiet_NaN()), _Tp(0)); } - bool __z_nonzero_nan = - !__z_inf && !__z_nan && (_CUDA_VSTD::__constexpr_isnan(__a) || _CUDA_VSTD::__constexpr_isnan(__b)); - bool __w_nonzero_nan = - !__w_inf && !__w_nan && (_CUDA_VSTD::__constexpr_isnan(__c) || _CUDA_VSTD::__constexpr_isnan(__d)); + bool __z_nonzero_nan = !__z_inf && !__z_nan && (_CUDA_VSTD::isnan(__a) || _CUDA_VSTD::isnan(__b)); + bool __w_nonzero_nan = !__w_inf && !__w_nan && (_CUDA_VSTD::isnan(__c) || _CUDA_VSTD::isnan(__d)); if (__z_nonzero_nan || __w_nonzero_nan) { if (__w_zero) @@ -708,26 +700,25 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) _Tp __x = 
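// Annotation, not part of the change: __c and __d were pre-scaled above by
// 2^(-__ilogbw), with __ilogbw = logb(max(|c|, |d|)), so that the denominator
// __denom = c*c + d*d can neither overflow nor underflow. Scaling only the divisor
// multiplies the raw quotient by 2^(+__ilogbw); the scalbn(..., -__ilogbw) applied
// to __x and __y just below removes exactly that factor again, e.g.
// scalbn(3.0, 4) == 48.0 (3 * 2^4) and scalbn(48.0, -4) == 3.0.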
_CUDA_VSTD::__constexpr_scalbn((__partials.__ac + __partials.__bd) / __denom, -__ilogbw); _Tp __y = _CUDA_VSTD::__constexpr_scalbn((__partials.__bc - __partials.__ad) / __denom, -__ilogbw); #ifndef LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_DIVISION - if (_CUDA_VSTD::__constexpr_isnan(__x) && _CUDA_VSTD::__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__x) && _CUDA_VSTD::isnan(__y)) { - if ((__denom == _Tp(0)) && (!_CUDA_VSTD::__constexpr_isnan(__a) || !_CUDA_VSTD::__constexpr_isnan(__b))) + if ((__denom == _Tp(0)) && (!_CUDA_VSTD::isnan(__a) || !_CUDA_VSTD::isnan(__b))) { __x = _CUDA_VSTD::__constexpr_copysign(_Tp(INFINITY), __c) * __a; __y = _CUDA_VSTD::__constexpr_copysign(_Tp(INFINITY), __c) * __b; } - else if ((_CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b)) - && _CUDA_VSTD::__constexpr_isfinite(__c) && _CUDA_VSTD::__constexpr_isfinite(__d)) + else if ((_CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b)) && _CUDA_VSTD::isfinite(__c) + && _CUDA_VSTD::isfinite(__d)) { - __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__a) ? _Tp(1) : _Tp(0), __a); - __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__b) ? _Tp(1) : _Tp(0), __b); + __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__b) ? _Tp(1) : _Tp(0), __b); __x = _Tp(INFINITY) * (__a * __c + __b * __d); __y = _Tp(INFINITY) * (__b * __c - __a * __d); } - else if (_CUDA_VSTD::__constexpr_isinf(__logbw) && __logbw > _Tp(0) && _CUDA_VSTD::__constexpr_isfinite(__a) - && _CUDA_VSTD::__constexpr_isfinite(__b)) + else if (_CUDA_VSTD::isinf(__logbw) && __logbw > _Tp(0) && _CUDA_VSTD::isfinite(__a) && _CUDA_VSTD::isfinite(__b)) { - __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__c) ? _Tp(1) : _Tp(0), __c); - __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__d) ? _Tp(1) : _Tp(0), __d); + __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__d) ? 
_Tp(1) : _Tp(0), __d); __x = _Tp(0) * (__a * __c + __b * __d); __y = _Tp(0) * (__b * __c - __a * __d); } @@ -928,11 +919,11 @@ _LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, float> arg(_Tp template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp norm(const complex<_Tp>& __c) { - if (_CUDA_VSTD::__constexpr_isinf(__c.real())) + if (_CUDA_VSTD::isinf(__c.real())) { return _CUDA_VSTD::abs(__c.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__c.imag())) + if (_CUDA_VSTD::isinf(__c.imag())) { return _CUDA_VSTD::abs(__c.imag()); } @@ -965,7 +956,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> proj(const complex<_Tp>& __c) { complex<_Tp> __r = __c; - if (_CUDA_VSTD::__constexpr_isinf(__c.real()) || _CUDA_VSTD::__constexpr_isinf(__c.imag())) + if (_CUDA_VSTD::isinf(__c.real()) || _CUDA_VSTD::isinf(__c.imag())) { __r = complex<_Tp>(INFINITY, __constexpr_copysign(_Tp(0), __c.imag())); } @@ -975,7 +966,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> proj(const complex<_Tp>& __c) template _LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __cccl_complex_complex_type<_Tp>> proj(_Tp __re) { - if (_CUDA_VSTD::__constexpr_isinf(__re)) + if (_CUDA_VSTD::isinf(__re)) { __re = _CUDA_VSTD::abs(__re); } @@ -993,33 +984,33 @@ _LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __cccl_complex_co template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = _Tp()) { - if (_CUDA_VSTD::__constexpr_isnan(__rho) || _CUDA_VSTD::signbit(__rho)) + if (_CUDA_VSTD::isnan(__rho) || _CUDA_VSTD::signbit(__rho)) { return complex<_Tp>(_Tp(NAN), _Tp(NAN)); } - if (_CUDA_VSTD::__constexpr_isnan(__theta)) + if (_CUDA_VSTD::isnan(__theta)) { - if (_CUDA_VSTD::__constexpr_isinf(__rho)) + if (_CUDA_VSTD::isinf(__rho)) { return complex<_Tp>(__rho, __theta); } return complex<_Tp>(__theta, __theta); } - if (_CUDA_VSTD::__constexpr_isinf(__theta)) + if (_CUDA_VSTD::isinf(__theta)) { - if (_CUDA_VSTD::__constexpr_isinf(__rho)) + if (_CUDA_VSTD::isinf(__rho)) { return complex<_Tp>(__rho, _Tp(NAN)); } return complex<_Tp>(_Tp(NAN), _Tp(NAN)); } _Tp __x = __rho * _CUDA_VSTD::cos(__theta); - if (_CUDA_VSTD::__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { __x = 0; } _Tp __y = __rho * _CUDA_VSTD::sin(__theta); - if (_CUDA_VSTD::__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { __y = 0; } @@ -1047,18 +1038,18 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> log10(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> sqrt(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(_Tp(INFINITY), __x.imag()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { if (__x.real() > _Tp(0)) { - return complex<_Tp>( - __x.real(), _CUDA_VSTD::__constexpr_isnan(__x.imag()) ? __x.imag() : __constexpr_copysign(_Tp(0), __x.imag())); + return complex<_Tp>(__x.real(), + _CUDA_VSTD::isnan(__x.imag()) ? __x.imag() : __constexpr_copysign(_Tp(0), __x.imag())); } - return complex<_Tp>(_CUDA_VSTD::__constexpr_isnan(__x.imag()) ? __x.imag() : _Tp(0), + return complex<_Tp>(_CUDA_VSTD::isnan(__x.imag()) ? 
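// Annotation, not part of the change: these branches implement the usual complex
// sqrt special values. sqrt(x + i*(+/-inf)) is +inf + i*(+/-inf) for any x, even
// NaN; sqrt(+inf + i*y) keeps the real part +inf and takes copysign(0, y) (or a
// propagated NaN y) as the imaginary part; and this branch, reached for
// real() == -inf, returns a zero (or NaN) real part with an imaginary part of
// magnitude inf carrying the sign of y. Other inputs fall through to
// polar(sqrt(|z|), arg(z) / 2) below.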
__x.imag() : _Tp(0), __constexpr_copysign(__x.real(), __x.imag())); } return _CUDA_VSTD::polar(_CUDA_VSTD::sqrt(_CUDA_VSTD::abs(__x)), _CUDA_VSTD::arg(__x) / _Tp(2)); @@ -1074,18 +1065,18 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> exp(const complex<_Tp>& __x) { return complex<_Tp>(_CUDA_VSTD::exp(__x.real()), __constexpr_copysign(_Tp(0), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { if (__x.real() < _Tp(0)) { - if (!_CUDA_VSTD::__constexpr_isfinite(__i)) + if (!_CUDA_VSTD::isfinite(__i)) { __i = _Tp(1); } } - else if (__i == _Tp(0) || !_CUDA_VSTD::__constexpr_isfinite(__i)) + else if (__i == _Tp(0) || !_CUDA_VSTD::isfinite(__i)) { - if (_CUDA_VSTD::__constexpr_isinf(__i)) + if (_CUDA_VSTD::isinf(__i)) { __i = _Tp(NAN); } @@ -1144,21 +1135,21 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> asinh(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if (_CUDA_VSTD::isnan(__x.imag())) { return __x; } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__x.real(), __constexpr_copysign(__pi * _Tp(0.25), __x.imag())); } return complex<_Tp>(__x.real(), __constexpr_copysign(_Tp(0), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__x.imag(), __x.real()); } @@ -1168,7 +1159,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> asinh(const complex<_Tp>& __x) } return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__constexpr_copysign(__x.imag(), __x.real()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } @@ -1182,13 +1173,13 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if (_CUDA_VSTD::isnan(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.real()), __x.imag()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { if (__x.real() > _Tp(0)) { @@ -1205,15 +1196,15 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) } return complex<_Tp>(__x.real(), __constexpr_copysign(_Tp(0), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.imag()), __x.real()); } return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.imag()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } @@ -1227,23 +1218,23 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__constexpr_copysign(_Tp(0), __x.real()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if 
(_CUDA_VSTD::isnan(__x.imag())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real()) || __x.real() == _Tp(0)) + if (_CUDA_VSTD::isinf(__x.real()) || __x.real() == _Tp(0)) { return complex<_Tp>(__constexpr_copysign(_Tp(0), __x.real()), __x.imag()); } return complex<_Tp>(__x.imag(), __x.imag()); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { return complex<_Tp>(__constexpr_copysign(_Tp(0), __x.real()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } @@ -1260,15 +1251,15 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real()) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (_CUDA_VSTD::isinf(__x.real()) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(__x.real(), _Tp(NAN)); } - if (__x.real() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (__x.real() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(__x.real(), _Tp(NAN)); } - if (__x.imag() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.real())) + if (__x.imag() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.real())) { return __x; } @@ -1281,11 +1272,11 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real()) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (_CUDA_VSTD::isinf(__x.real()) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.real()), _Tp(NAN)); } - if (__x.real() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (__x.real() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(_Tp(NAN), __x.real()); } @@ -1293,7 +1284,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { return complex<_Tp>(_Tp(1), __x.imag()); } - if (__x.imag() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.real())) + if (__x.imag() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.real())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.real()), __x.imag()); } @@ -1306,16 +1297,16 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> tanh(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (!_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (!_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(__constexpr_copysign(_Tp(1), __x.real()), _Tp(0)); } return complex<_Tp>(__constexpr_copysign(_Tp(1), __x.real()), __constexpr_copysign(_Tp(0), _CUDA_VSTD::sin(_Tp(2) * __x.imag()))); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real()) && __x.imag() == _Tp(0)) + if (_CUDA_VSTD::isnan(__x.real()) && __x.imag() == _Tp(0)) { return __x; } @@ -1323,7 +1314,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> tanh(const complex<_Tp>& __x) _Tp __2i(_Tp(2) * __x.imag()); _Tp __d(_CUDA_VSTD::cosh(__2r) + _CUDA_VSTD::cos(__2i)); _Tp __2rsh(_CUDA_VSTD::sinh(__2r)); - if (_CUDA_VSTD::__constexpr_isinf(__2rsh) && _CUDA_VSTD::__constexpr_isinf(__d)) + if (_CUDA_VSTD::isinf(__2rsh) && _CUDA_VSTD::isinf(__d)) { return complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), __2i > _Tp(0) ? 
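// Annotation, not part of the change: tanh(x + iy) is evaluated here as
// (sinh(2x) + i*sin(2y)) / (cosh(2x) + cos(2y)). For large |x| both sinh(2x) and
// the denominator overflow to infinity, and inf/inf would produce NaN; this branch
// instead returns the mathematical limit: a real part of +/-1 following the sign
// of sinh(2x), together with a signed zero imaginary part.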
_Tp(0) : _Tp(-0.)); } @@ -1345,13 +1336,13 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if (_CUDA_VSTD::isnan(__x.imag())) { return complex<_Tp>(__x.imag(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { if (__x.real() < _Tp(0)) { @@ -1365,15 +1356,15 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) } return complex<_Tp>(_Tp(0), _CUDA_VSTD::signbit(__x.imag()) ? __x.real() : -__x.real()); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__x.real(), -__x.imag()); } return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__pi / _Tp(2), -__x.imag()); } diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp new file mode 100644 index 00000000000..a42b36caeaa --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp @@ -0,0 +1,118 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_fmax(T value) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + assert(cuda::std::fmax(value, (T) 0) == value); +} + +__host__ __device__ void test_fmax(float value) +{ + test_fmax(value); + test_fmax(value); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_fmax(value); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_fmax<__half>(__float2half(value)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_fmax<__nv_bfloat16>(__float2bfloat16(value)); +#endif // _LIBCUDACXX_HAS_NVBF16 + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +} + +template +__host__ __device__ void test_fmin(T value) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + assert(cuda::std::fmin(value, (T) 0) == T(0)); +} + +__host__ __device__ void test_fmin(float value) +{ + test_fmax(value); + test_fmax(value); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_fmax(value); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_fmax<__half>(__float2half(value)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_fmax<__nv_bfloat16>(__float2bfloat16(value)); +#endif // _LIBCUDACXX_HAS_NVBF16 + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +} + +__host__ __device__ void test(float value) +{ + test_fmax(value); + test_fmin(value); +} + +__global__ void test_global_kernel(float* value) +{ + test(*value); +} + +int main(int, char**) +{ + volatile float value = 1.0f; + test(value); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp new file mode 100644 index 00000000000..93e469bb3f4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp @@ -0,0 +1,458 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include +#include + +#include "fp_compare.h" +#include "test_macros.h" + +template +__host__ __device__ void test_fpclassify(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::fpclassify(T(val)) == FP_NORMAL); + assert(cuda::std::fpclassify(T(1.0)) == FP_NORMAL); + assert(cuda::std::fpclassify(T(0.0)) == FP_ZERO); + assert(cuda::std::fpclassify(T(-1.0)) == FP_NORMAL); + assert(cuda::std::fpclassify(T(-0.0)) == FP_ZERO); + // extended floating point types have issues here + if (!cuda::std::__is_extended_floating_point::value) + { + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::quiet_NaN())) == FP_NAN); + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::infinity())) == FP_INFINITE); + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::denorm_min())) == FP_SUBNORMAL); + } + else + { + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::quiet_NaN())) == FP_NAN); + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::infinity())) == FP_INFINITE); + // float subnormal turns to 0.0 for our half precision types + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::denorm_min())) == FP_ZERO); + } +} + +__host__ __device__ void test_fpclassify(float val) +{ + test_fpclassify(val); + test_fpclassify(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_fpclassify(val); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_fpclassify<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_fpclassify<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::fpclassify(0u) == FP_ZERO); + assert(cuda::std::fpclassify(cuda::std::numeric_limits::max()) == FP_NORMAL); + assert(cuda::std::fpclassify(1) == FP_NORMAL); + assert(cuda::std::fpclassify(-1) == FP_NORMAL); + assert(cuda::std::fpclassify(cuda::std::numeric_limits::max()) == FP_NORMAL); + assert(cuda::std::fpclassify(cuda::std::numeric_limits::min()) == FP_NORMAL); +} + +template +__host__ __device__ void test_signbit(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::signbit(T(val)) == false); + assert(cuda::std::signbit(T(-1.0)) == true); + assert(cuda::std::signbit(T(0.0)) == false); +} + +__host__ __device__ void test_signbit(float val) +{ + test_signbit(val); + test_signbit(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_signbit(val); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_signbit<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_signbit<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::signbit(0u) == false); + assert(cuda::std::signbit(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::signbit(1) == false); + assert(cuda::std::signbit(-1) == true); + assert(cuda::std::signbit(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::signbit(cuda::std::numeric_limits::min()) == true); +} + +template +__host__ __device__ void test_isfinite(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::isfinite(T(val)) == true); + assert(cuda::std::isfinite(T(-1.0f)) == true); + assert(cuda::std::isfinite(T(1.0f)) == true); + 
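// Annotation on the fpclassify checks earlier in this file, not part of the test:
// numeric_limits<float>::denorm_min() is 2^-149, far below the smallest subnormal
// representable in __half (2^-24) or __nv_bfloat16 (2^-133), so converting it to
// those types rounds to zero and fpclassify reports FP_ZERO rather than
// FP_SUBNORMAL, which is exactly what the extended-floating-point branch expects.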
assert(cuda::std::isfinite(T(NAN)) == false); + assert(cuda::std::isfinite(T(INFINITY)) == false); + assert(cuda::std::isfinite(-T(INFINITY)) == false); +} + +__host__ __device__ void test_isfinite(float val) +{ + test_isfinite(val); + test_isfinite(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_isfinite(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_isfinite<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_isfinite<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::isfinite(0) == true); + assert(cuda::std::isfinite(1) == true); + assert(cuda::std::isfinite(-1) == true); + assert(cuda::std::isfinite(cuda::std::numeric_limits::max()) == true); + assert(cuda::std::isfinite(cuda::std::numeric_limits::min()) == true); +} + +__host__ __device__ _CCCL_CONSTEXPR_ISFINITE bool test_constexpr_isfinite(float val) +{ + return cuda::std::isfinite(val); +} + +template +__host__ __device__ void test_isnormal(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::isnormal(T(val)) == true); + assert(cuda::std::isnormal(T(-1.0f)) == true); + assert(cuda::std::isnormal(T(1.0f)) == true); + assert(cuda::std::isnormal(T(0.0f)) == false); + assert(cuda::std::isnormal(T(NAN)) == false); + assert(cuda::std::isnormal(T(INFINITY)) == false); + assert(cuda::std::isnormal(-T(INFINITY)) == false); +} + +__host__ __device__ void test_isnormal(float val) +{ + test_isnormal(val); + test_isnormal(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_isnormal(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_isnormal<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_isnormal<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::isnormal(0) == false); + assert(cuda::std::isnormal(1) == true); + assert(cuda::std::isnormal(-1) == true); + assert(cuda::std::isnormal(cuda::std::numeric_limits::max()) == true); + assert(cuda::std::isnormal(cuda::std::numeric_limits::min()) == true); +} + +__host__ __device__ void test_isgreater(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isgreater(-1.0, 0.F) == false); +} + +__host__ __device__ void test_isgreaterequal(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + 
static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); + static_assert( + (cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isgreaterequal(-1.0, 0.F) == false); +} + +__host__ __device__ void test_isinf(float val) +{ + static_assert((cuda::std::is_same::value), ""); + + typedef decltype(cuda::std::isinf((double) 0)) DoubleRetType; + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isinf(-1.0) == false); + assert(cuda::std::isinf(0) == false); + assert(cuda::std::isinf(1) == false); + assert(cuda::std::isinf(-1) == false); + assert(cuda::std::isinf(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::isinf(cuda::std::numeric_limits::min()) == false); +} + +__host__ __device__ _CCCL_CONSTEXPR_ISINF bool test_constexpr_isinf(float val) +{ + return cuda::std::isinf(val); +} + +__host__ __device__ void test_isless(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isless(-1.0, 0.F) == true); +} + +__host__ __device__ void test_islessequal(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + 
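// Annotation, not part of the test: isgreater/isgreaterequal/isless/islessequal/
// islessgreater/isunordered are the "quiet" comparison functions. Unlike the
// built-in operators they never raise FE_INVALID on a quiet-NaN operand; they
// simply return false (isunordered returns true). Hence the single value check per
// function here, while the bulk of each test pins down the bool return type for
// every promoted argument combination. For example:
//   cuda::std::isless(-1.0, 0.0f)      -> true
//   cuda::std::isless(NAN, 0.0f)       -> false
//   cuda::std::isunordered(NAN, 0.0f)  -> true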
static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::islessequal(-1.0, 0.F) == true); +} + +__host__ __device__ void test_islessgreater(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::islessgreater(-1.0, 0.F) == true); +} + +__host__ __device__ void test_isnan(float val) +{ + static_assert((cuda::std::is_same::value), ""); + + typedef decltype(cuda::std::isnan((double) 0)) DoubleRetType; + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isnan(-1.0) == false); + assert(cuda::std::isnan(0) == false); + assert(cuda::std::isnan(1) == false); + assert(cuda::std::isnan(-1) == false); + assert(cuda::std::isnan(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::isnan(cuda::std::numeric_limits::min()) == false); +} + +__host__ __device__ _CCCL_CONSTEXPR_ISNAN bool test_constexpr_isnan(float val) +{ + return cuda::std::isnan(val); +} + +__host__ __device__ void test_isunordered(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + 
static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isunordered(-1.0, 0.F) == false); +} + +__host__ __device__ void test(float val) +{ + test_fpclassify(val); + test_signbit(val); + test_isfinite(val); + test_isnormal(val); + test_isgreater(val); + test_isgreaterequal(val); + test_isinf(val); + test_isless(val); + test_islessequal(val); + test_islessgreater(val); + test_isnan(val); + test_isunordered(val); +} + +__global__ void test_global_kernel(float* val) +{ + test(*val); +} + +int main(int, char**) +{ + volatile float val = 1.0f; + test(val); + +#if defined(_CCCL_BUILTIN_ISNAN) + static_assert(!test_constexpr_isnan(1.0f), ""); +#endif // _CCCL_BUILTIN_ISNAN + +#if defined(_CCCL_BUILTIN_ISINF) + static_assert(!test_constexpr_isinf(1.0f), ""); +#endif // _CCCL_BUILTIN_ISINF + +#if defined(_CCCL_BUILTIN_ISFINITE) || (defined(_CCCL_BUILTIN_ISINF) && defined(_CCCL_BUILTIN_ISNAN)) + static_assert(test_constexpr_isfinite(1.0f), ""); +#endif // _CCCL_BUILTIN_ISFINITE|| (_CCCL_BUILTIN_ISINF && _CCCL_BUILTIN_ISNAN) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp new file mode 100644 index 00000000000..366ff77bd7a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
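// Illustrative sketch, not part of the new test file: cuda::std::lerp(a, b, t)
// computes a + t*(b - a); it is exact at t == 0 and t == 1 and extrapolates for t
// outside [0, 1]. The exactly-representable values the test checks can be
// reproduced with the naive formula (helper name is hypothetical):
template <class T>
constexpr T lerp_sketch(T a, T b, T t)
{
  return a + t * (b - a);
}
static_assert(lerp_sketch(0.0, 12.0, 0.5) == 6.0, "lerp(0, 12, 0.5) == 6");
static_assert(lerp_sketch(12.0, 0.0, 2.0) == -12.0, "lerp(12, 0, 2) == -12");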
+// +//===----------------------------------------------------------------------===// + +// + +// constexpr float lerp(float a, float b, float t) noexcept; +// constexpr double lerp(double a, double b, double t) noexcept; +// constexpr long double lerp(long double a, long double b, long double t) noexcept; + +#include +#include +#include +#include + +#include "fp_compare.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool constexpr_test() +{ + return cuda::std::lerp(T(0.0), T(12), T(0.0)) == T(0.0) && cuda::std::lerp(T(12), T(0.0), T(0.5)) == T(6) + && cuda::std::lerp(T(0.0), T(12), T(2)) == T(24); +} + +template +__host__ __device__ void test() +{ + ASSERT_SAME_TYPE(T, decltype(cuda::std::lerp(T(), T(), T()))); + static_assert(noexcept(cuda::std::lerp(T(), T(), T())), ""); + + const T maxV = cuda::std::numeric_limits::max(); + const T inf = cuda::std::numeric_limits::infinity(); + + // Things that can be compared exactly + assert((cuda::std::lerp(T(0.0), T(12), T(0.0)) == T(0.0))); + assert((cuda::std::lerp(T(0.0), T(12), T(1)) == T(12))); + assert((cuda::std::lerp(T(12), T(0.0), T(0.0)) == T(12))); + assert((cuda::std::lerp(T(12), T(0.0), T(1)) == T(0.0))); + + assert((cuda::std::lerp(T(0.0), T(12), T(0.5)) == T(6))); + assert((cuda::std::lerp(T(12), T(0.0), T(0.5)) == T(6))); + assert((cuda::std::lerp(T(0.0), T(12), T(2)) == T(24))); + assert((cuda::std::lerp(T(12), T(0.0), T(2)) == T(-12))); + + assert((cuda::std::lerp(maxV, maxV / T(10), T(0.0)) == maxV)); + assert((cuda::std::lerp(maxV / T(10), maxV, T(1)) == maxV)); + + assert((cuda::std::lerp(T(2.3), T(2.3), inf) == T(2.3))); + + assert(cuda::std::lerp(T(0.0), T(0.0), T(23)) == T(0.0)); + + // __half and __nvbfloat have precision issues here + if (!cuda::std::__is_extended_floating_point::value) + { + assert(cuda::std::isnan(cuda::std::lerp(T(0.0), T(0.0), T(inf)))); + } +} + +int main(int, char**) +{ + test(); + test(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test(); +#endif //!_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 + +#if TEST_STD_VER >= 2014 + static_assert(constexpr_test(), ""); + static_assert(constexpr_test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp new file mode 100644 index 00000000000..ea0aff8f0ff --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp @@ -0,0 +1,109 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
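// Annotation, not part of the new test file: the `ret` alias used by each helper
// below encodes the usual <cmath> promotion rule, i.e. integral arguments select
// the double overload, so for instance cuda::std::log(1) is a double equal to 0.0,
// while floating-point arguments return their own type. Every function in this
// file is probed at the value 1.0 (or 0.0 for log1p), where all of them are
// exactly zero: log(1) = log10(1) = log2(1) = logb(1) = log1p(0) = 0, and
// ilogb(1) = 0 as an int.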
+// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_log(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log(value) == ret{0}); +} + +template +__host__ __device__ void test_log10(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log10(value) == ret{0}); +} + +template +__host__ __device__ void test_ilogb(T value) +{ + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::ilogb(value) == 0); +} + +template +__host__ __device__ void test_log1p(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log1p(value - value) == ret{0}); +} + +template +__host__ __device__ void test_log2(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log2(value) == ret{0}); +} + +template +__host__ __device__ void test_logb(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::logb(value) == ret{0}); +} + +template +__host__ __device__ void test(T value) +{ + test_log(value); + test_log10(value); + test_ilogb(value); + test_log1p(value); + test_log2(value); + test_logb(value); +} + +__host__ __device__ void test(float value) +{ + test(value); + test(value); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test(value); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(__float2half(value)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(__float2bfloat16(value)); +#endif // _LIBCUDACXX_HAS_NVBF16 + + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); +} + +__global__ void test_global_kernel(float* value) +{ + test(*value); +} + +int main(int, char**) +{ + volatile float value = 1.0f; + test(value); + return 0; +} From 831c62e94ce23091cc0d7d6280b6b38e9606baba Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 10:15:20 +0100 Subject: [PATCH 30/45] Drop memory resources in libcu++ (#2860) This moves our memory resources into cuda::experimental That way it is easier to use them because cudax is the actual user --- cudax/examples/simple_p2p.cu | 6 +- .../uninitialized_async_buffer.cuh | 11 +- .../__container/uninitialized_buffer.cuh | 9 +- .../__memory_resource/any_resource.cuh | 6 +- .../__memory_resource/device_memory_pool.cuh | 17 +- .../device_memory_resource.cuh | 67 ++--- .../managed_memory_resource.cuh | 254 ++++++++++++++++ .../pinned_memory_resource.cuh | 256 ++++++++++++++++ .../__memory_resource/properties.cuh | 49 ++++ .../__memory_resource/shared_resource.cuh | 4 +- .../cuda/experimental/memory_resource.cuh | 3 + cudax/test/CMakeLists.txt | 2 + cudax/test/algorithm/common.cuh | 4 +- cudax/test/algorithm/copy.cu | 10 +- cudax/test/algorithm/fill.cu | 6 +- .../containers/uninitialized_async_buffer.cu | 10 +- cudax/test/containers/uninitialized_buffer.cu | 19 +- .../memory_resource/any_async_resource.cu | 14 +- cudax/test/memory_resource/any_resource.cu | 12 +- 
.../memory_resource/device_memory_pool.cu | 30 +- .../memory_resource/device_memory_resource.cu | 64 ++-- .../managed_memory_resource.cu | 273 +++++++++++++++++ .../memory_resource/pinned_memory_resource.cu | 274 ++++++++++++++++++ cudax/test/memory_resource/shared_resource.cu | 14 +- .../device_memory_resource.h | 219 -------------- .../managed_memory_resource.h | 201 ------------- .../pinned_memory_resource.h | 204 ------------- libcudacxx/include/cuda/memory_resource | 3 - .../device_memory_resource/allocate.pass.cpp | 95 ------ .../device_memory_resource/equality.pass.cpp | 144 --------- .../device_memory_resource/traits.pass.cpp | 31 -- .../managed_memory_resource/allocate.pass.cpp | 96 ------ .../managed_memory_resource/equality.pass.cpp | 130 --------- .../managed_memory_resource/traits.pass.cpp | 31 -- .../pinned_memory_resource/allocate.pass.cpp | 98 ------- .../pinned_memory_resource/equality.pass.cpp | 132 --------- .../pinned_memory_resource/traits.pass.cpp | 31 -- 37 files changed, 1263 insertions(+), 1566 deletions(-) create mode 100644 cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh create mode 100644 cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh create mode 100644 cudax/include/cuda/experimental/__memory_resource/properties.cuh create mode 100644 cudax/test/memory_resource/managed_memory_resource.cu create mode 100644 cudax/test/memory_resource/pinned_memory_resource.cu delete mode 100644 libcudacxx/include/cuda/__memory_resource/device_memory_resource.h delete mode 100644 libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h delete mode 100644 libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu index 5b83a43b904..c6d9be9f707 100644 --- a/cudax/examples/simple_p2p.cu +++ b/cudax/examples/simple_p2p.cu @@ -121,7 +121,7 @@ void test_cross_device_access_from_kernel( // This will be a pinned memory vector once available cudax::uninitialized_buffer host_buffer( - cuda::mr::pinned_memory_resource(), dev0_buffer.size()); + cudax::pinned_memory_resource(), dev0_buffer.size()); std::generate(host_buffer.begin(), host_buffer.end(), []() { static int i = 0; return static_cast((i++) % 4096); @@ -219,9 +219,9 @@ try cudax::stream dev1_stream(peers[1]); printf("Enabling peer access between GPU%d and GPU%d...\n", peers[0].get(), peers[1].get()); - cudax::mr::device_memory_resource dev0_resource(peers[0]); + cudax::device_memory_resource dev0_resource(peers[0]); 
dev0_resource.enable_peer_access_from(peers[1]); - cudax::mr::device_memory_resource dev1_resource(peers[1]); + cudax::device_memory_resource dev1_resource(peers[1]); dev1_resource.enable_peer_access_from(peers[0]); // Allocate buffers diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index fb502cbbf7d..731ed555bb3 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -33,6 +33,7 @@ #include #include +#include #if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) @@ -73,10 +74,10 @@ class uninitialized_async_buffer { private: static_assert(_CUDA_VMR::__contains_execution_space_property<_Properties...>, - "The properties of cuda::experimental::mr::uninitialized_async_buffer must contain at least one " + "The properties of cuda::experimental::uninitialized_async_buffer must contain at least one " "execution space property!"); - using __async_resource = ::cuda::experimental::mr::any_async_resource<_Properties...>; + using __async_resource = ::cuda::experimental::any_async_resource<_Properties...>; __async_resource __mr_; ::cuda::stream_ref __stream_ = {}; @@ -117,7 +118,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { // TODO add auto synchronization return {__self.__get_data(), __self.size()}; @@ -129,7 +130,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { // TODO add auto synchronization return {__self.__get_data(), __self.size()}; @@ -294,7 +295,7 @@ public: }; template -using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_VMR::device_accessible>; +using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, mr::device_accessible>; } // namespace cuda::experimental diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 9a2f1200678..1f661c0c7d5 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -32,6 +32,7 @@ #include #include +#include #if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) @@ -67,7 +68,7 @@ private: "The properties of cuda::experimental::uninitialized_buffer must contain at least one execution space " "property!"); - using __resource = ::cuda::experimental::mr::any_resource<_Properties...>; + using __resource = ::cuda::experimental::any_resource<_Properties...>; __resource __mr_; size_t __count_ = 0; @@ -107,7 +108,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, 
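// Annotation, not part of the change: this hidden friend is the hook cudax's
// launch machinery uses, letting an uninitialized_buffer be passed directly to a
// kernel launch. When the buffer advertises the device_accessible property,
// __cudax_launch_transform decays it to a cuda::std::span<T> over its storage.
// The patch only re-points the property check from _CUDA_VMR::device_accessible
// to the properties now provided by cuda/experimental/__memory_resource/properties.cuh,
// which is newly included above.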
uninitialized_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { return {__self.__get_data(), __self.size()}; } @@ -118,7 +119,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { return {__self.__get_data(), __self.size()}; } @@ -259,7 +260,7 @@ public: }; template -using uninitialized_device_buffer = uninitialized_buffer<_Tp, _CUDA_VMR::device_accessible>; +using uninitialized_device_buffer = uninitialized_buffer<_Tp, mr::device_accessible>; } // namespace cuda::experimental diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index f442e56dcfe..c3d6fce7a08 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -51,7 +51,7 @@ #include #include -namespace cuda::experimental::mr +namespace cuda::experimental { template > _CCCL_INLINE_VAR constexpr bool __is_basic_any_resource = false; @@ -73,7 +73,7 @@ class basic_any_resource { private: static_assert(_CUDA_VMR::__contains_execution_space_property<_Properties...>, - "The properties of cuda::experimental::mr::basic_any_resource must contain at least one execution " + "The properties of cuda::experimental::basic_any_resource must contain at least one execution " "space property!"); template <_CUDA_VMR::_AllocType, class...> @@ -352,6 +352,6 @@ auto make_any_async_resource(_Args&&... __args) -> any_async_resource<_Propertie return any_async_resource<_Properties...>{_CUDA_VSTD::in_place_type<_Resource>, _CUDA_VSTD::forward<_Args>(__args)...}; } -} // namespace cuda::experimental::mr +} // namespace cuda::experimental #endif //_CUDAX__MEMORY_RESOURCE_ANY_RESOURCE_H diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh index c74f7d68f77..f3ffcfeea24 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh @@ -43,7 +43,7 @@ //! @file //! The \c device_memory_pool class provides a wrapper around a `cudaMempool_t`. -namespace cuda::experimental::mr +namespace cuda::experimental { //! @brief Checks whether the current device supports \c cudaMallocAsync. @@ -166,7 +166,7 @@ private: _CCCL_NODISCARD static cudaMemPool_t __create_cuda_mempool(const int __device_id, memory_pool_properties __properties) noexcept { - ::cuda::experimental::mr::__device_supports_stream_ordered_allocations(__device_id); + ::cuda::experimental::__device_supports_stream_ordered_allocations(__device_id); device_memory_pool::__cuda_supports_export_handle_type(__device_id, __properties.allocation_handle_type); ::cudaMemPoolProps __pool_properties{}; @@ -315,7 +315,7 @@ public: //! 
@param __devices A vector of `device_ref`s listing devices to enable access for void enable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_handle_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtReadWrite); } @@ -324,8 +324,7 @@ public: //! @param __device device_ref indicating for which device the access should be enabled void enable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access( - __pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); + ::cuda::experimental::__mempool_switch_peer_access(__pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); } //! @brief Disable peer access to this memory pool from the supplied devices @@ -335,7 +334,7 @@ public: //! @param __devices A vector of `device_ref`s listing devices to disable access for void disable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_handle_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtNone); } @@ -344,7 +343,7 @@ public: //! @param __device device_ref indicating for which device the access should be disable void disable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access(__pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtNone); + ::cuda::experimental::__mempool_switch_peer_access(__pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtNone); } //! @brief Query if memory allocated through this memory resource is accessible by the supplied device @@ -352,7 +351,7 @@ public: //! @param __device device for which the peer access is queried _CCCL_NODISCARD bool is_accessible_from(device_ref __device) { - return ::cuda::experimental::mr::__mempool_get_access(__pool_handle_, __device); + return ::cuda::experimental::__mempool_get_access(__pool_handle_, __device); } //! @brief Equality comparison with another \c device_memory_pool. @@ -424,7 +423,7 @@ public: static device_memory_pool from_native_handle(_CUDA_VSTD::nullptr_t) = delete; }; -} // namespace cuda::experimental::mr +} // namespace cuda::experimental # endif // _CCCL_STD_VER >= 2014 diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh index fffe3dea722..c0aedab7fa9 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh @@ -40,6 +40,7 @@ # include # include +# include # include # if _CCCL_STD_VER >= 2014 @@ -47,7 +48,7 @@ //! @file //! The \c device_memory_pool class provides an asynchronous memory resource that allocates device memory in stream //! order. -namespace cuda::experimental::mr +namespace cuda::experimental { //! @brief global stream to synchronize in the synchronous interface of \c device_memory_resource @@ -92,7 +93,7 @@ private: //! @returns The default memory pool of the specified device. _CCCL_NODISCARD static ::cudaMemPool_t __get_default_mem_pool(const int __device_id) { - ::cuda::experimental::mr::__device_supports_stream_ordered_allocations(__device_id); + ::cuda::experimental::__device_supports_stream_ordered_allocations(__device_id); ::cudaMemPool_t __pool; _CCCL_TRY_CUDA_API( @@ -247,7 +248,7 @@ public: //! 
@param __devices A vector of `device_ref`s listing devices to enable access for void enable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtReadWrite); } @@ -259,7 +260,7 @@ public: //! @param __device device_ref indicating for which device the access should be enabled void enable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); + ::cuda::experimental::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); } //! @brief Enable peer access to memory allocated through this memory resource by the supplied devices @@ -271,7 +272,7 @@ public: //! @param __devices A vector of `device_ref`s listing devices to disable access for void disable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtNone); } @@ -283,7 +284,7 @@ public: //! @param __device device_ref indicating for which device the access should be enabled void disable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtNone); + ::cuda::experimental::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtNone); } //! @brief Query if memory allocated through this memory resource is accessible by the supplied device @@ -291,7 +292,7 @@ public: //! @param __device device for which the peer access is queried _CCCL_NODISCARD bool is_accessible_from(device_ref __device) { - return ::cuda::experimental::mr::__mempool_get_access(__pool_, __device); + return ::cuda::experimental::__mempool_get_access(__pool_, __device); } //! @brief Equality comparison with another device_memory_resource. 
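A minimal sketch of how the peer-access controls above compose with stream-ordered allocation, assuming two peer-capable devices; the helper name `share_with_peer`, the sizes, and the abbreviated includes are illustrative only and not part of the patch:

    #define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
    #include <cuda/experimental/memory_resource.cuh>
    #include <cassert>

    namespace cudax = cuda::experimental;

    // Allocate from dev0's default pool and let dev1 touch the memory.
    void share_with_peer(cudax::device_ref dev0, cudax::device_ref dev1, ::cuda::stream_ref stream_on_dev1)
    {
      cudax::device_memory_resource mr{dev0};
      mr.enable_peer_access_from(dev1);
      assert(mr.is_accessible_from(dev1));

      void* ptr = mr.allocate_async(1 << 20, stream_on_dev1); // stream-ordered allocation
      // ... launch kernels on stream_on_dev1 that read/write ptr from dev1 ...
      mr.deallocate_async(ptr, 1 << 20, stream_on_dev1);
    }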
@@ -319,10 +320,10 @@ public: _CCCL_REQUIRES((_CUDA_VMR::__different_resource) ) _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept { - if constexpr (has_property<_Resource, _CUDA_VMR::device_accessible>) + if constexpr (has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(this)} - == _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } else { @@ -332,68 +333,68 @@ public: # else // ^^^ C++20 ^^^ / vvv C++17 template _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - == _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const&, _Resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return false; } template _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - == _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator==(_Resource const&, device_memory_resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return false; } template _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - != _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + != _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const&, _Resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return true; } template _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, 
device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - != _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + != _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const&, device_memory_resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return true; } @@ -408,12 +409,12 @@ public: # ifndef _CCCL_DOXYGEN_INVOKED // Doxygen cannot handle the friend function //! @brief Enables the \c device_accessible property for \c device_memory_resource. //! @relates device_memory_resource - friend constexpr void get_property(device_memory_resource const&, _CUDA_VMR::device_accessible) noexcept {} + friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {} # endif // _CCCL_DOXYGEN_INVOKED }; -static_assert(_CUDA_VMR::resource_with, ""); +static_assert(_CUDA_VMR::resource_with, ""); -} // namespace cuda::experimental::mr +} // namespace cuda::experimental # endif // _CCCL_STD_VER >= 2014 diff --git a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh new file mode 100644 index 00000000000..f240155339c --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh @@ -0,0 +1,254 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_RESOURCE_CUH +#define _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_RESOURCE_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER_CLANG) +# include +#endif // _CCCL_CUDA_COMPILER_CLANG + +#include +#include +#include +#include +#include +#include +#include + +#include + +//! @file +//! The \c managed_memory_resource class provides a memory resource that allocates managed memory. +namespace cuda::experimental +{ + +//! @brief \c managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. 
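A short usage sketch of the interface declared just below (the flag and size are arbitrary, error handling is elided):

    namespace cudax = cuda::experimental;

    cudax::managed_memory_resource mr{cudaMemAttachGlobal}; // cudaMemAttachHost is the other accepted flag
    float* data = static_cast<float*>(mr.allocate(1024 * sizeof(float))); // cudaMallocManaged under the hood
    data[0] = 1.0f;                                         // managed memory is valid on host and device
    mr.deallocate(data, 1024 * sizeof(float));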
+class managed_memory_resource +{ +private: + unsigned int __flags_ = cudaMemAttachGlobal; + + static constexpr unsigned int __available_flags = cudaMemAttachGlobal | cudaMemAttachHost; + +public: + constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept + : __flags_(__flags & __available_flags) + { + _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource"); + } + + //! @brief Allocate CUDA unified memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. + //! @return Pointer to the newly allocated memory + _CCCL_NODISCARD void* allocate(const size_t __bytes, + const size_t __alignment = _CUDA_VMR::default_cuda_malloc_alignment) const + { + // We need to ensure that the provided alignment matches the minimal provided alignment + if (!__is_valid_alignment(__alignment)) + { + _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to managed_memory_resource::allocate."); + } + + void* __ptr{nullptr}; + _CCCL_TRY_CUDA_API( + ::cudaMallocManaged, "Failed to allocate memory with cudaMallocManaged.", &__ptr, __bytes, __flags_); + return __ptr; + } + + //! @brief Allocate CUDA unified memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @param __stream Stream on which to perform allocation. Currently ignored + //! @throws std::invalid_argument In case of invalid alignment. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes, __alignment); + } + + //! @brief Allocate CUDA unified memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __stream Stream on which to perform allocation. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes); + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. + //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. + void deallocate( + void* __ptr, const size_t, const size_t __alignment = _CUDA_VMR::default_cuda_malloc_alignment) const noexcept + { + // We need to ensure that the provided alignment matches the minimal provided alignment + _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to managed_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr); + (void) __alignment; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! 
@param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __alignment; + (void) __stream; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! @param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, size_t __bytes, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __stream; + } + + //! @brief Equality comparison with another \c managed_memory_resource. + //! @param __other The other \c managed_memory_resource. + //! @return Whether both \c managed_memory_resource were constructed with the same flags. + _CCCL_NODISCARD constexpr bool operator==(managed_memory_resource const& __other) const noexcept + { + return __flags_ == __other.__flags_; + } +#if _CCCL_STD_VER <= 2017 + //! @brief Inequality comparison with another \c managed_memory_resource. + //! @param __other The other \c managed_memory_resource. + //! @return Whether both \c managed_memory_resource were constructed with different flags. + _CCCL_NODISCARD constexpr bool operator!=(managed_memory_resource const& __other) const noexcept + { + return __flags_ != __other.__flags_; + } +#endif // _CCCL_STD_VER <= 2017 + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# if _CCCL_STD_VER >= 2020 + //! @brief Equality comparison between a \c managed_memory_resource and another resource + //! @param __rhs The resource to compare to + //! @return If the underlying types are equality comparable, returns the result of equality comparison of both + //! resources. Otherwise, returns false. 
+ _CCCL_TEMPLATE(class _Resource) + _CCCL_REQUIRES(_CUDA_VMR::__different_resource) + _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept + { + if constexpr (has_property<_Resource, mr::device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else if constexpr (has_property<_Resource, mr::host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else + { + return false; + } + } +# else // ^^^ C++20 ^^^ / vvv C++17 + template + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& + has_property<_Resource, mr::device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !has_property<_Resource, mr::device_accessible> && has_property<_Resource, mr::host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const&, _Resource const&) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !has_property<_Resource, mr::device_accessible> && !has_property<_Resource, mr::host_accessible>) + { + return false; + } + + template + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, managed_memory_resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return __rhs == __lhs; + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return !(__lhs == __rhs); + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return !(__rhs == __lhs); + } +# endif // _CCCL_STD_VER <= 2017 + + //! @brief Enables the \c device_accessible property + friend constexpr void get_property(managed_memory_resource const&, mr::device_accessible) noexcept {} + //! @brief Enables the \c host_accessible property + friend constexpr void get_property(managed_memory_resource const&, mr::host_accessible) noexcept {} +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //!
@brief Checks whether the passed in alignment is valid + static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept + { + return __alignment <= _CUDA_VMR::default_cuda_malloc_alignment + && (_CUDA_VMR::default_cuda_malloc_alignment % __alignment == 0); + } +}; +static_assert(_CUDA_VMR::async_resource_with, ""); +static_assert(_CUDA_VMR::async_resource_with, ""); + +} // namespace cuda::experimental + +#endif //_CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_RESOURCE_CUH diff --git a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh new file mode 100644 index 00000000000..60ec7c9b49e --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh @@ -0,0 +1,256 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H +#define _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER_CLANG) +# include +# include +#endif // _CCCL_CUDA_COMPILER_CLANG + +#include +#include +#include +#include +#include +#include +#include + +#include + +//! @file +//! The \c managed_memory_resource class provides a memory resource that allocates pinned memory. +namespace cuda::experimental +{ + +//! @brief pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. +class pinned_memory_resource +{ +private: + unsigned int __flags_ = cudaHostAllocDefault; + + static constexpr unsigned int __available_flags = + cudaHostAllocDefault | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined; + +public: + constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept + : __flags_(__flags & __available_flags) + { + _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource"); + } + + //! @brief Allocate host memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. + //! @return Pointer to the newly allocated memory + _CCCL_NODISCARD void* allocate(const size_t __bytes, + const size_t __alignment = _CUDA_VMR::default_cuda_malloc_host_alignment) const + { + // We need to ensure that the provided alignment matches the minimal provided alignment + if (!__is_valid_alignment(__alignment)) + { + _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to pinned_memory_resource::allocate."); + } + + void* __ptr{nullptr}; + _CCCL_TRY_CUDA_API(::cudaMallocHost, "Failed to allocate memory with cudaMallocHost.", &__ptr, __bytes, __flags_); + return __ptr; + } + + //! 
@brief Allocate host memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @param __stream Stream on which to perform allocation. Currently ignored + //! @throws std::invalid_argument In case of invalid alignment. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes, __alignment); + } + + //! @brief Allocate host memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __stream Stream on which to perform allocation. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes); + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. + //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. + void deallocate( + void* __ptr, const size_t, const size_t __alignment = _CUDA_VMR::default_cuda_malloc_host_alignment) const noexcept + { + // We need to ensure that the provided alignment matches the minimal provided alignment + _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to pinned_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr); + (void) __alignment; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! @param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __alignment; + (void) __stream; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! @param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! 
It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, size_t __bytes, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __stream; + } + + //! @brief Equality comparison with another \c pinned_memory_resource. + //! @param __other The other \c pinned_memory_resource. + //! @return Whether both \c pinned_memory_resource were constructed with the same flags. + _CCCL_NODISCARD constexpr bool operator==(pinned_memory_resource const& __other) const noexcept + { + return __flags_ == __other.__flags_; + } +#if _CCCL_STD_VER <= 2017 + //! @brief Inequality comparison with another \c pinned_memory_resource. + //! @param __other The other \c pinned_memory_resource. + //! @return Whether both \c pinned_memory_resource were constructed with different flags. + _CCCL_NODISCARD constexpr bool operator!=(pinned_memory_resource const& __other) const noexcept + { + return __flags_ != __other.__flags_; + } +#endif // _CCCL_STD_VER <= 2017 + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# if _CCCL_STD_VER >= 2020 + //! @brief Equality comparison between a \c pinned_memory_resource and another resource + //! @param __rhs The resource to compare to + //! @return If the underlying types are equality comparable, returns the result of equality comparison of both + //! resources. Otherwise, returns false. + _CCCL_TEMPLATE(class _Resource) + _CCCL_REQUIRES(_CUDA_VMR::__different_resource) + _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept + { + if constexpr (has_property<_Resource, device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else if constexpr (has_property<_Resource, host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else + { + return false; + } + } +# else // ^^^ C++20 ^^^ / vvv C++17 + template + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& ::cuda::has_property<_Resource, device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !::cuda::has_property<_Resource, device_accessible> && ::cuda::has_property<_Resource, host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const&, _Resource const&) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !::cuda::has_property<_Resource, device_accessible> && !::cuda::has_property<_Resource, host_accessible>) + { + return false; + } + + template + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, pinned_memory_resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return __rhs == __lhs; + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return
!(__lhs == __rhs); + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return !(__rhs == __lhs); + } +# endif // _CCCL_STD_VER <= 2017 + + //! @brief Enables the \c device_accessible property + friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} + //! @brief Enables the \c host_accessible property + friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @brief Checks whether the passed in alignment is valid + static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept + { + return __alignment <= _CUDA_VMR::default_cuda_malloc_host_alignment + && (_CUDA_VMR::default_cuda_malloc_host_alignment % __alignment == 0); + } +}; +static_assert(_CUDA_VMR::async_resource_with, ""); +static_assert(_CUDA_VMR::async_resource_with, ""); + +} // namespace cuda::experimental + +#endif //_CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H diff --git a/cudax/include/cuda/experimental/__memory_resource/properties.cuh b/cudax/include/cuda/experimental/__memory_resource/properties.cuh new file mode 100644 index 00000000000..b1646ab4b36 --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/properties.cuh @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__MEMORY_RESOURCE_PROPERTIES_CUH +#define _CUDAX__MEMORY_RESOURCE_PROPERTIES_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// If the memory resource header was included without the experimental flag, +// tell the user to define the experimental flag. 
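In practice this means user code defines the macro before any cudax include and can then refer to the accessibility properties directly from cuda::experimental (they are re-exported further down in this header). A sketch, where fill_on_device is a hypothetical helper and not part of the patch:

    #define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
    #include <cuda/experimental/memory_resource.cuh>
    #include <cstddef>

    namespace cudax = cuda::experimental;

    // Constrain a generic helper to resources whose allocations the device can access.
    template <class Resource>
    void* fill_on_device(Resource& mr, std::size_t bytes)
    {
      static_assert(::cuda::has_property<Resource, cudax::device_accessible>,
                    "resource must provide device-accessible memory");
      void* ptr = mr.allocate(bytes);
      // ... launch a kernel that writes to ptr ...
      return ptr;
    }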
+#if defined(_CUDA_MEMORY_RESOURCE) && !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# error "To use the experimental memory resource, define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" +#endif + +// cuda::mr is unavable on MSVC 2017 +#if _CCCL_COMPILER(MSVC2017) +# error "The any_resource header is not supported on MSVC 2017" +#endif // _CCCL_COMPILER(MSVC2017) + +#if !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif + +#include + +namespace cuda::experimental +{ + +using ::cuda::mr::device_accessible; +using ::cuda::mr::host_accessible; + +} // namespace cuda::experimental + +#endif //_CUDAX__MEMORY_RESOURCE_PROPERTIES_CUH diff --git a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh index 1b0a81320b1..bfea3e43e68 100644 --- a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh @@ -44,7 +44,7 @@ #include #include -namespace cuda::experimental::mr +namespace cuda::experimental { //! @rst @@ -268,6 +268,6 @@ auto make_shared_resource(_Args&&... __args) -> shared_resource<_Resource> return shared_resource<_Resource>{_CUDA_VSTD::forward<_Args>(__args)...}; } -} // namespace cuda::experimental::mr +} // namespace cuda::experimental #endif // _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H diff --git a/cudax/include/cuda/experimental/memory_resource.cuh b/cudax/include/cuda/experimental/memory_resource.cuh index 42f32a97d8a..c1bb3b916ac 100644 --- a/cudax/include/cuda/experimental/memory_resource.cuh +++ b/cudax/include/cuda/experimental/memory_resource.cuh @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #endif // __CUDAX_MEMORY_RESOURCE___ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 2e54f2ca6d1..9af2c83cc6f 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -110,6 +110,8 @@ foreach(cn_target IN LISTS cudax_TARGETS) memory_resource/any_resource.cu memory_resource/device_memory_pool.cu memory_resource/device_memory_resource.cu + memory_resource/managed_memory_resource.cu + memory_resource/pinned_memory_resource.cu memory_resource/shared_resource.cu ) diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index 4b262966190..c4c7be0d02c 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -63,11 +63,11 @@ namespace cuda::experimental template > struct weird_buffer { - const cuda::mr::pinned_memory_resource& resource; + const pinned_memory_resource& resource; int* data; std::size_t size; - weird_buffer(const cuda::mr::pinned_memory_resource& res, std::size_t s) + weird_buffer(const pinned_memory_resource& res, std::size_t s) : resource(res) , data((int*) res.allocate(s * sizeof(int))) , size(s) diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu index 3db65e22c51..afb9a2b71d5 100644 --- a/cudax/test/algorithm/copy.cu +++ b/cudax/test/algorithm/copy.cu @@ -16,7 +16,7 @@ TEST_CASE("1d Copy", "[data_manipulation]") SECTION("Device resource") { - cudax::mr::device_memory_resource device_resource; + cudax::device_memory_resource device_resource; std::vector host_vector(buffer_size); { @@ -46,8 +46,8 @@ TEST_CASE("1d Copy", "[data_manipulation]") SECTION("Host and managed resource") { - cuda::mr::managed_memory_resource managed_resource; - cuda::mr::pinned_memory_resource 
host_resource; + cudax::managed_memory_resource managed_resource; + cudax::pinned_memory_resource host_resource; { cudax::uninitialized_buffer host_buffer(host_resource, buffer_size); @@ -78,7 +78,7 @@ TEST_CASE("1d Copy", "[data_manipulation]") } SECTION("Launch transform") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::weird_buffer input(host_resource, buffer_size); cudax::weird_buffer output(host_resource, buffer_size); @@ -90,7 +90,7 @@ TEST_CASE("1d Copy", "[data_manipulation]") SECTION("Asymetric size") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::uninitialized_buffer host_buffer(host_resource, 1); cudax::fill_bytes(_stream, host_buffer, fill_byte); diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu index ce733871f51..35fae342ad3 100644 --- a/cudax/test/algorithm/fill.cu +++ b/cudax/test/algorithm/fill.cu @@ -15,7 +15,7 @@ TEST_CASE("Fill", "[data_manipulation]") cudax::stream _stream; SECTION("Host resource") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::uninitialized_buffer buffer(host_resource, buffer_size); cudax::fill_bytes(_stream, buffer, fill_byte); @@ -25,7 +25,7 @@ TEST_CASE("Fill", "[data_manipulation]") SECTION("Device resource") { - cuda::mr::device_memory_resource device_resource; + cudax::device_memory_resource device_resource; cudax::uninitialized_buffer buffer(device_resource, buffer_size); cudax::fill_bytes(_stream, buffer, fill_byte); @@ -37,7 +37,7 @@ TEST_CASE("Fill", "[data_manipulation]") } SECTION("Launch transform") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::weird_buffer buffer(host_resource, buffer_size); cudax::fill_bytes(_stream, buffer, fill_byte); diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 6a63a5f99f2..3ec6f1bed6d 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -42,7 +42,7 @@ constexpr int get_property( { return 42; } -constexpr int get_property(const cuda::experimental::mr::device_memory_resource&, my_property) +constexpr int get_property(const cuda::experimental::device_memory_resource&, my_property) { return 42; } @@ -56,7 +56,7 @@ TEMPLATE_TEST_CASE( static_assert(!cuda::std::is_copy_constructible::value, ""); static_assert(!cuda::std::is_copy_assignable::value, ""); - cuda::experimental::mr::device_memory_resource resource{}; + cuda::experimental::device_memory_resource resource{}; cuda::experimental::stream stream{}; SECTION("construction") @@ -207,7 +207,7 @@ TEMPLATE_TEST_CASE( // A test resource that keeps track of the number of resources are // currently alive. 
-struct test_async_device_memory_resource : cudax::mr::device_memory_resource +struct test_async_device_memory_resource : cudax::device_memory_resource { static int count; @@ -217,7 +217,7 @@ struct test_async_device_memory_resource : cudax::mr::device_memory_resource } test_async_device_memory_resource(const test_async_device_memory_resource& other) - : cudax::mr::device_memory_resource{other} + : cudax::device_memory_resource{other} { ++count; } @@ -234,7 +234,7 @@ TEST_CASE("uninitialized_async_buffer's memory resource does not dangle", "[cont { cuda::experimental::stream stream{}; cudax::uninitialized_async_buffer buffer{ - cudax::mr::device_memory_resource{}, stream, 0}; + cudax::device_memory_resource{}, stream, 0}; { CHECK(test_async_device_memory_resource::count == 0); diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 22fe1ef473c..56ac77da86c 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -20,6 +19,7 @@ #include #include +#include #include #include "testing.cuh" @@ -56,7 +56,7 @@ constexpr int get_property( { return 42; } -constexpr int get_property(const cuda::mr::device_memory_resource&, my_property) +constexpr int get_property(const cudax::device_memory_resource&, my_property) { return 42; } @@ -69,7 +69,7 @@ TEMPLATE_TEST_CASE( static_assert(!cuda::std::is_copy_constructible::value, ""); static_assert(!cuda::std::is_copy_assignable::value, ""); - cuda::mr::device_memory_resource resource{}; + cudax::device_memory_resource resource{}; SECTION("construction") { @@ -111,7 +111,7 @@ TEMPLATE_TEST_CASE( { static_assert(!cuda::std::is_copy_assignable::value, ""); { - cuda::mr::managed_memory_resource other_resource{}; + cudax::managed_memory_resource other_resource{}; uninitialized_buffer input{other_resource, 42}; uninitialized_buffer buf{resource, 1337}; const auto* old_ptr = buf.data(); @@ -222,7 +222,7 @@ TEST_CASE("uninitialized_buffer is usable with cudax::launch", "[container]") SECTION("non-const") { const int grid_size = 4; - cudax::uninitialized_buffer buffer{cuda::mr::device_memory_resource{}, 1024}; + cudax::uninitialized_buffer buffer{cudax::device_memory_resource{}, 1024}; auto dimensions = cudax::make_hierarchy(cudax::grid_dims(grid_size), cudax::block_dims<256>()); cudax::stream stream; @@ -233,8 +233,7 @@ TEST_CASE("uninitialized_buffer is usable with cudax::launch", "[container]") SECTION("const") { const int grid_size = 4; - const cudax::uninitialized_buffer buffer{ - cuda::mr::device_memory_resource{}, 1024}; + const cudax::uninitialized_buffer buffer{cudax::device_memory_resource{}, 1024}; auto dimensions = cudax::make_hierarchy(cudax::grid_dims(grid_size), cudax::block_dims<256>()); cudax::stream stream; @@ -245,7 +244,7 @@ TEST_CASE("uninitialized_buffer is usable with cudax::launch", "[container]") // A test resource that keeps track of the number of resources are // currently alive. 
-struct test_device_memory_resource : cuda::mr::device_memory_resource +struct test_device_memory_resource : cudax::device_memory_resource { static int count; @@ -255,7 +254,7 @@ struct test_device_memory_resource : cuda::mr::device_memory_resource } test_device_memory_resource(const test_device_memory_resource& other) - : cuda::mr::device_memory_resource{other} + : cudax::device_memory_resource{other} { ++count; } @@ -270,7 +269,7 @@ int test_device_memory_resource::count = 0; TEST_CASE("uninitialized_buffer's memory resource does not dangle", "[container]") { - cudax::uninitialized_buffer buffer{cuda::mr::device_memory_resource{}, 0}; + cudax::uninitialized_buffer buffer{cudax::device_memory_resource{}, 0}; { CHECK(test_device_memory_resource::count == 0); diff --git a/cudax/test/memory_resource/any_async_resource.cu b/cudax/test/memory_resource/any_async_resource.cu index f032ac3f6b8..89c28b8a279 100644 --- a/cudax/test/memory_resource/any_async_resource.cu +++ b/cudax/test/memory_resource/any_async_resource.cu @@ -25,7 +25,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -44,7 +44,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -79,7 +79,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -108,7 +108,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou CHECK(this->counts == expected); { cudax::stream stream{}; - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -135,7 +135,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou { Counts expected{}; { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -165,8 +165,8 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr = - cudax::mr::make_any_async_resource(42, this); + cudax::any_async_resource mr = + cudax::make_any_async_resource(42, this); expected.new_count += is_big; ++expected.object_count; CHECK(this->counts == expected); diff --git a/cudax/test/memory_resource/any_resource.cu b/cudax/test/memory_resource/any_resource.cu index 213dee61d93..c013785f32f 100644 --- a/cudax/test/memory_resource/any_resource.cu +++ b/cudax/test/memory_resource/any_resource.cu @@ -24,7 +24,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - 
cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -43,7 +43,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -78,7 +78,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -105,7 +105,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", { Counts expected{}; { - cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -135,8 +135,8 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_resource mr = - cudax::mr::make_any_resource(42, this); + cudax::any_resource mr = + cudax::make_any_resource(42, this); expected.new_count += is_big; ++expected.object_count; CHECK(this->counts == expected); diff --git a/cudax/test/memory_resource/device_memory_pool.cu b/cudax/test/memory_resource/device_memory_pool.cu index 22faeda6bb8..3260829c4b6 100644 --- a/cudax/test/memory_resource/device_memory_pool.cu +++ b/cudax/test/memory_resource/device_memory_pool.cu @@ -22,7 +22,7 @@ #include namespace cudax = cuda::experimental; -using pool = cudax::mr::device_memory_pool; +using pool = cudax::device_memory_pool; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(!cuda::std::is_default_constructible::value, ""); @@ -89,10 +89,10 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") current_device); } - using memory_pool = cudax::mr::device_memory_pool; + using memory_pool = cudax::device_memory_pool; SECTION("Construct from device id") { - cudax::mr::device_memory_pool from_device{current_device}; + cudax::device_memory_pool from_device{current_device}; ::cudaMemPool_t get = from_device.get(); CHECK(get != current_default_pool); @@ -109,7 +109,7 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") SECTION("Construct with empty properties") { - cudax::mr::memory_pool_properties props{}; + cudax::memory_pool_properties props{}; memory_pool from_defaulted_properties{current_device, props}; ::cudaMemPool_t get = from_defaulted_properties.get(); @@ -127,7 +127,7 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") SECTION("Construct with initial pool size") { - cudax::mr::memory_pool_properties props = {42, 20}; + cudax::memory_pool_properties props = {42, 20}; memory_pool with_threshold{current_device, props}; ::cudaMemPool_t get = with_threshold.get(); @@ -147,8 +147,8 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") #if _CCCL_CUDACC_AT_LEAST(11, 2) SECTION("Construct with allocation handle") { - cudax::mr::memory_pool_properties props = { - 42, 20, cudax::mr::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor}; + 
cudax::memory_pool_properties props = { + 42, 20, cudax::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor}; memory_pool with_allocation_handle{current_device, props}; ::cudaMemPool_t get = with_allocation_handle.get(); @@ -175,7 +175,7 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") ::cudaMemPool_t new_pool{}; _CCCL_TRY_CUDA_API(::cudaMemPoolCreate, "Failed to call cudaMemPoolCreate", &new_pool, &pool_properties); - cudax::mr::device_memory_pool from_handle = cudax::mr::device_memory_pool::from_native_handle(new_pool); + cudax::device_memory_pool from_handle = cudax::device_memory_pool::from_native_handle(new_pool); CHECK(from_handle.get() == new_pool); } } @@ -200,9 +200,9 @@ TEST_CASE("device_memory_pool comparison", "[memory_resource]") current_device); } - cudax::mr::device_memory_pool first{current_device}; + cudax::device_memory_pool first{current_device}; { // comparison against a plain device_memory_pool - cudax::mr::device_memory_pool second{current_device}; + cudax::device_memory_pool second{current_device}; CHECK(first == first); CHECK(first != second); } @@ -237,7 +237,7 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") SECTION("device_memory_pool::set_attribute") { - cudax::mr::device_memory_pool pool{current_device}; + cudax::device_memory_pool pool{current_device}; { // cudaMemPoolReuseFollowEventDependencies // Get the attribute value @@ -300,7 +300,7 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") } // prime the pool to a given size - cudax::mr::device_memory_resource resource{pool}; + cudax::device_memory_resource resource{pool}; cudax::stream stream{}; // Allocate a buffer to prime @@ -417,9 +417,9 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") SECTION("device_memory_pool::trim_to") { - cudax::mr::device_memory_pool pool{current_device}; + cudax::device_memory_pool pool{current_device}; // prime the pool to a given size - cudax::mr::device_memory_resource resource{pool}; + cudax::device_memory_resource resource{pool}; cudax::stream stream{}; // Allocate 2 buffers @@ -476,7 +476,7 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") auto peers = cudax::devices[0].get_peers(); if (peers.size() > 0) { - cudax::mr::device_memory_pool pool{cudax::devices[0]}; + cudax::device_memory_pool pool{cudax::devices[0]}; CUDAX_CHECK(pool.is_accessible_from(cudax::devices[0])); pool.enable_peer_access_from(peers); diff --git a/cudax/test/memory_resource/device_memory_resource.cu b/cudax/test/memory_resource/device_memory_resource.cu index 55839831bc4..29b5d4d9baf 100644 --- a/cudax/test/memory_resource/device_memory_resource.cu +++ b/cudax/test/memory_resource/device_memory_resource.cu @@ -21,15 +21,15 @@ namespace cudax = cuda::experimental; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_default_constructible::value, ""); -static_assert(cuda::std::is_copy_constructible::value, ""); -static_assert(cuda::std::is_move_constructible::value, ""); -static_assert(cuda::std::is_copy_assignable::value, ""); -static_assert(cuda::std::is_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); +static_assert(!cuda::std::is_trivial::value, ""); +static_assert(!cuda::std::is_trivially_default_constructible::value, ""); +static_assert(cuda::std::is_default_constructible::value, ""); 
+static_assert(cuda::std::is_copy_constructible::value, ""); +static_assert(cuda::std::is_move_constructible::value, ""); +static_assert(cuda::std::is_copy_assignable::value, ""); +static_assert(cuda::std::is_move_assignable::value, ""); +static_assert(cuda::std::is_trivially_destructible::value, ""); +static_assert(!cuda::std::is_empty::value, ""); static bool ensure_release_threshold(::cudaMemPool_t pool, const size_t expected_threshold) { @@ -87,7 +87,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") current_device); } - using async_resource = cuda::experimental::mr::device_memory_resource; + using async_resource = cuda::experimental::device_memory_resource; SECTION("Default construction") { { @@ -99,7 +99,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") void* ptr{nullptr}; _CCCL_TRY_CUDA_API( ::cudaMallocAsync, - "Failed to allocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to allocate with pool passed to cuda::experimental::device_memory_resource", &ptr, 42, current_default_pool, @@ -108,7 +108,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") _CCCL_ASSERT_CUDA_API( ::cudaFreeAsync, - "Failed to deallocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to deallocate with pool passed to cuda::experimental::device_memory_resource", ptr, ::cudaStream_t{0}); } @@ -133,7 +133,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") void* ptr{nullptr}; _CCCL_TRY_CUDA_API( ::cudaMallocAsync, - "Failed to allocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to allocate with pool passed to cuda::experimental::device_memory_resource", &ptr, 42, current_default_pool, @@ -142,17 +142,17 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") _CCCL_ASSERT_CUDA_API( ::cudaFreeAsync, - "Failed to deallocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to deallocate with pool passed to cuda::experimental::device_memory_resource", ptr, ::cudaStream_t{0}); } SECTION("Construct with initial pool size") { - cuda::experimental::mr::memory_pool_properties props = { + cuda::experimental::memory_pool_properties props = { 42, }; - cuda::experimental::mr::device_memory_pool pool{current_device, props}; + cuda::experimental::device_memory_pool pool{current_device, props}; async_resource from_initial_pool_size{pool}; ::cudaMemPool_t get = from_initial_pool_size.get(); @@ -170,11 +170,11 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") SECTION("Construct with release threshold") { - cuda::experimental::mr::memory_pool_properties props = { + cuda::experimental::memory_pool_properties props = { 42, 20, }; - cuda::experimental::mr::device_memory_pool pool{current_device, props}; + cuda::experimental::device_memory_pool pool{current_device, props}; async_resource with_threshold{pool}; ::cudaMemPool_t get = with_threshold.get(); @@ -194,12 +194,12 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") #if _CCCL_CUDACC_AT_LEAST(11, 2) SECTION("Construct with allocation handle") { - cuda::experimental::mr::memory_pool_properties props = { + cuda::experimental::memory_pool_properties props = { 42, 20, - cuda::experimental::mr::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor, + cuda::experimental::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor, }; - cuda::experimental::mr::device_memory_pool 
pool{current_device, props}; + cuda::experimental::device_memory_pool pool{current_device, props}; async_resource with_allocation_handle{pool}; ::cudaMemPool_t get = with_allocation_handle.get(); @@ -228,7 +228,7 @@ static void ensure_device_ptr(void* ptr) TEST_CASE("device_memory_resource allocation", "[memory_resource]") { - cuda::experimental::mr::device_memory_resource res{}; + cuda::experimental::device_memory_resource res{}; { // allocate / deallocate auto* ptr = res.allocate(42); @@ -404,9 +404,9 @@ TEST_CASE("device_memory_resource comparison", "[memory_resource]") _CCCL_TRY_CUDA_API(::cudaGetDevice, "Failed to query current device with cudaGetDevice.", ¤t_device); } - cuda::experimental::mr::device_memory_resource first{}; + cuda::experimental::device_memory_resource first{}; { // comparison against a plain device_memory_resource - cuda::experimental::mr::device_memory_resource second{}; + cuda::experimental::device_memory_resource second{}; CHECK(first == second); CHECK(!(first != second)); } @@ -421,13 +421,13 @@ TEST_CASE("device_memory_resource comparison", "[memory_resource]") pool_properties.location.id = current_device; _CCCL_TRY_CUDA_API(::cudaMemPoolCreate, "Failed to call cudaMemPoolCreate", &cuda_pool_handle, &pool_properties); } - cuda::experimental::mr::device_memory_resource second{cuda_pool_handle}; + cuda::experimental::device_memory_resource second{cuda_pool_handle}; CHECK(first != second); CHECK(!(first == second)); } { // comparison against a device_memory_resource wrapped inside a resource_ref - cuda::experimental::mr::device_memory_resource second{}; + cuda::experimental::device_memory_resource second{}; cuda::mr::resource_ref second_ref{second}; CHECK(first == second_ref); CHECK(!(first != second_ref)); @@ -436,7 +436,7 @@ TEST_CASE("device_memory_resource comparison", "[memory_resource]") } { // comparison against a device_memory_resource wrapped inside a async_resource_ref - cuda::experimental::mr::device_memory_resource second{}; + cuda::experimental::device_memory_resource second{}; cuda::mr::async_resource_ref second_ref{second}; CHECK(first == second_ref); @@ -481,8 +481,8 @@ TEST_CASE("Async memory resource peer access") auto peers = cudax::devices[0].get_peers(); if (peers.size() > 0) { - cudax::mr::device_memory_pool pool{cudax::devices[0]}; - cudax::mr::device_memory_resource resource{pool}; + cudax::device_memory_pool pool{cudax::devices[0]}; + cudax::device_memory_resource resource{pool}; cudax::stream stream{peers.front()}; CUDAX_CHECK(resource.is_accessible_from(cudax::devices[0])); @@ -503,7 +503,7 @@ TEST_CASE("Async memory resource peer access") CUDAX_CHECK(resource.is_accessible_from(peers.front())); allocate_and_check_access(resource); - cudax::mr::device_memory_resource another_resource{pool}; + cudax::device_memory_resource another_resource{pool}; CUDAX_CHECK(another_resource.is_accessible_from(peers.front())); allocate_and_check_access(another_resource); @@ -527,8 +527,8 @@ TEST_CASE("Async memory resource peer access") resource.enable_peer_access_from(peers); // Check the resource using the default pool - cudax::mr::device_memory_resource default_pool_resource{}; - cudax::mr::device_memory_resource another_default_pool_resource{}; + cudax::device_memory_resource default_pool_resource{}; + cudax::device_memory_resource another_default_pool_resource{}; default_pool_resource.enable_peer_access_from(peers.front()); diff --git a/cudax/test/memory_resource/managed_memory_resource.cu b/cudax/test/memory_resource/managed_memory_resource.cu new 
file mode 100644 index 00000000000..073402124bd --- /dev/null +++ b/cudax/test/memory_resource/managed_memory_resource.cu @@ -0,0 +1,273 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace cudax = cuda::experimental; + +using managed_resource = cudax::managed_memory_resource; +static_assert(!cuda::std::is_trivial::value, ""); +static_assert(!cuda::std::is_trivially_default_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_constructible::value, ""); +static_assert(cuda::std::is_trivially_move_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_assignable::value, ""); +static_assert(cuda::std::is_trivially_move_assignable::value, ""); +static_assert(cuda::std::is_trivially_destructible::value, ""); +static_assert(!cuda::std::is_empty::value, ""); + +static void ensure_managed_ptr(void* ptr) +{ + CHECK(ptr != nullptr); + cudaPointerAttributes attributes; + cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); + CHECK(status == cudaSuccess); + CHECK(attributes.type == cudaMemoryTypeManaged); +} + +TEST_CASE("managed_memory_resource construction", "[memory_resource]") +{ + SECTION("Default construction") + { + STATIC_REQUIRE(cuda::std::is_default_constructible_v); + } + + SECTION("Construct with flag") + { + managed_resource defaulted{}; + managed_resource with_flag{cudaMemAttachHost}; + CHECK(defaulted != with_flag); + } +} + +TEST_CASE("managed_memory_resource allocation", "[memory_resource]") +{ + managed_resource res{}; + cudax::stream stream{}; + + { // allocate / deallocate + auto* ptr = res.allocate(42); + static_assert(cuda::std::is_same::value, ""); + ensure_managed_ptr(ptr); + + res.deallocate(ptr, 42); + } + + { // allocate / deallocate with alignment + auto* ptr = res.allocate(42, 4); + static_assert(cuda::std::is_same::value, ""); + ensure_managed_ptr(ptr); + + res.deallocate(ptr, 42, 4); + } + + { // allocate_async / deallocate_async + auto* ptr = res.allocate_async(42, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_managed_ptr(ptr); + + res.deallocate_async(ptr, 42, stream); + } + + { // allocate_async / deallocate_async with alignment + auto* ptr = res.allocate_async(42, 4, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_managed_ptr(ptr); + + res.deallocate_async(ptr, 42, 4, stream); + } + +#ifndef _LIBCUDACXX_NO_EXCEPTIONS + { // allocate with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 42); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + + { // allocate with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 1337); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + { // allocate_async with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 42, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } 
+ } + + { // allocate_async with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 1337, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } +#endif // _LIBCUDACXX_NO_EXCEPTIONS +} + +enum class AccessibilityType +{ + Device, + Host, +}; + +template +struct resource +{ + void* allocate(size_t, size_t) + { + return nullptr; + } + void deallocate(void*, size_t, size_t) noexcept {} + + bool operator==(const resource&) const + { + return true; + } + bool operator!=(const resource& other) const + { + return false; + } +}; +static_assert(cuda::mr::resource>, ""); +static_assert(cuda::mr::resource>, ""); + +template +struct async_resource : public resource +{ + void* allocate_async(size_t, size_t, cuda::stream_ref) + { + return nullptr; + } + void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} +}; +static_assert(cuda::mr::async_resource>, ""); +static_assert(cuda::mr::async_resource>, ""); + +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_managed_resource : cudax::managed_memory_resource +{ + using cudax::managed_memory_resource::managed_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + +TEST_CASE("managed_memory_resource comparison", "[memory_resource]") +{ + managed_resource first{}; + { // comparison against a plain managed_memory_resource + managed_resource second{}; + CHECK(first == second); + CHECK(!(first != second)); + } + + { // comparison against a plain managed_memory_resource with a different pool + managed_resource second{cudaMemAttachHost}; + CHECK(first != second); + CHECK(!(first == second)); + } + + { // comparison against a managed_memory_resource wrapped inside a resource_ref + managed_resource second{}; + cuda::mr::resource_ref second_ref{second}; + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a managed_memory_resource wrapped inside a async_resource_ref + managed_resource second{}; + cuda::mr::async_resource_ref second_ref{second}; + + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a different managed_resource through resource_ref + resource host_resource{}; + resource device_resource{}; + CHECK(!(first == host_resource)); + CHECK(first != host_resource); + CHECK(!(first == device_resource)); + CHECK(first != device_resource); + + CHECK(!(host_resource == first)); + CHECK(host_resource != first); + CHECK(!(device_resource == first)); + CHECK(device_resource != first); + } + + { // comparison against a different managed_resource through resource_ref + resource host_async_resource{}; + resource device_async_resource{}; + CHECK(!(first == host_async_resource)); + CHECK(first != host_async_resource); + CHECK(!(first == device_async_resource)); + CHECK(first != device_async_resource); + + CHECK(!(host_async_resource == first)); + CHECK(host_async_resource != first); + CHECK(!(device_async_resource == first)); + CHECK(device_async_resource != first); + } +} diff --git a/cudax/test/memory_resource/pinned_memory_resource.cu b/cudax/test/memory_resource/pinned_memory_resource.cu new file mode 100644 index 00000000000..6423b292de0 --- /dev/null +++ b/cudax/test/memory_resource/pinned_memory_resource.cu @@ -0,0 +1,274 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA 
Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace cudax = cuda::experimental; + +using pinned_resource = cudax::pinned_memory_resource; +static_assert(!cuda::std::is_trivial::value, ""); +static_assert(!cuda::std::is_trivially_default_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_constructible::value, ""); +static_assert(cuda::std::is_trivially_move_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_assignable::value, ""); +static_assert(cuda::std::is_trivially_move_assignable::value, ""); +static_assert(cuda::std::is_trivially_destructible::value, ""); +static_assert(!cuda::std::is_empty::value, ""); + +static void ensure_pinned_ptr(void* ptr) +{ + CHECK(ptr != nullptr); + cudaPointerAttributes attributes; + cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); + CHECK(status == cudaSuccess); + CHECK(attributes.type == cudaMemoryTypeHost); + CHECK(attributes.devicePointer != nullptr); +} + +TEST_CASE("pinned_memory_resource construction", "[memory_resource]") +{ + SECTION("Default construction") + { + STATIC_REQUIRE(cuda::std::is_default_constructible_v); + } + + SECTION("Construct with flag") + { + pinned_resource defaulted{}; + pinned_resource with_flag{cudaHostAllocMapped}; + CHECK(defaulted != with_flag); + } +} + +TEST_CASE("pinned_memory_resource allocation", "[memory_resource]") +{ + pinned_resource res{}; + cudax::stream stream{}; + + { // allocate / deallocate + auto* ptr = res.allocate(42); + static_assert(cuda::std::is_same::value, ""); + ensure_pinned_ptr(ptr); + + res.deallocate(ptr, 42); + } + + { // allocate / deallocate with alignment + auto* ptr = res.allocate(42, 4); + static_assert(cuda::std::is_same::value, ""); + ensure_pinned_ptr(ptr); + + res.deallocate(ptr, 42, 4); + } + + { // allocate_async / deallocate_async + auto* ptr = res.allocate_async(42, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_pinned_ptr(ptr); + + res.deallocate_async(ptr, 42, stream); + } + + { // allocate_async / deallocate_async with alignment + auto* ptr = res.allocate_async(42, 4, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_pinned_ptr(ptr); + + res.deallocate_async(ptr, 42, 4, stream); + } + +#ifndef _LIBCUDACXX_NO_EXCEPTIONS + { // allocate with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 42); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + + { // allocate with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 1337); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + { // allocate_async with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 42, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + + { // allocate_async with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 1337, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + 
CHECK(false); + } + } +#endif // _LIBCUDACXX_NO_EXCEPTIONS +} + +enum class AccessibilityType +{ + Device, + Host, +}; + +template +struct resource +{ + void* allocate(size_t, size_t) + { + return nullptr; + } + void deallocate(void*, size_t, size_t) noexcept {} + + bool operator==(const resource&) const + { + return true; + } + bool operator!=(const resource& other) const + { + return false; + } +}; +static_assert(cuda::mr::resource>, ""); +static_assert(cuda::mr::resource>, ""); + +template +struct async_resource : public resource +{ + void* allocate_async(size_t, size_t, cuda::stream_ref) + { + return nullptr; + } + void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} +}; +static_assert(cuda::mr::async_resource>, ""); +static_assert(cuda::mr::async_resource>, ""); + +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_pinned_resource : cudax::pinned_memory_resource +{ + using cudax::pinned_memory_resource::pinned_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + +TEST_CASE("pinned_memory_resource comparison", "[memory_resource]") +{ + pinned_resource first{}; + { // comparison against a plain pinned_memory_resource + pinned_resource second{}; + CHECK(first == second); + CHECK(!(first != second)); + } + + { // comparison against a plain pinned_memory_resource with a different pool + pinned_resource second{cudaMemAttachHost}; + CHECK(first != second); + CHECK(!(first == second)); + } + + { // comparison against a pinned_memory_resource wrapped inside a resource_ref + pinned_resource second{}; + cuda::mr::resource_ref second_ref{second}; + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a pinned_memory_resource wrapped inside a async_resource_ref + pinned_resource second{}; + cuda::mr::async_resource_ref second_ref{second}; + + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a different pinned_resource through resource_ref + resource host_resource{}; + resource device_resource{}; + CHECK(!(first == host_resource)); + CHECK(first != host_resource); + CHECK(!(first == device_resource)); + CHECK(first != device_resource); + + CHECK(!(host_resource == first)); + CHECK(host_resource != first); + CHECK(!(device_resource == first)); + CHECK(device_resource != first); + } + + { // comparison against a different pinned_resource through resource_ref + resource host_async_resource{}; + resource device_async_resource{}; + CHECK(!(first == host_async_resource)); + CHECK(first != host_async_resource); + CHECK(!(first == device_async_resource)); + CHECK(first != device_async_resource); + + CHECK(!(host_async_resource == first)); + CHECK(host_async_resource != first); + CHECK(!(device_async_resource == first)); + CHECK(device_async_resource != first); + } +} diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu index 4cdd7bc1d31..02e98f10cf9 100644 --- a/cudax/test/memory_resource/shared_resource.cu +++ b/cudax/test/memory_resource/shared_resource.cu @@ -18,14 +18,14 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource]", big_resource, small_resource) { using TestResource = TestType; - static_assert(cuda::mr::async_resource>); + static_assert(cuda::mr::async_resource>); SECTION("construct and destruct") { Counts expected{}; CHECK(this->counts == expected); 
    {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
     }
@@ -42,7 +42,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
     Counts expected{};
     CHECK(this->counts == expected);
     {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
@@ -56,7 +56,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
       CHECK(mr2 == mr3); // pointers compare equal, no call to TestResource::operator==
       CHECK(this->counts == expected);
-      cudax::mr::shared_resource mr4{TestResource{42, this}};
+      cudax::shared_resource mr4{TestResource{42, this}};
       ++expected.object_count;
       ++expected.move_count;
       CHECK(mr3 == mr4); // pointers are not equal, calls TestResource::operator==
@@ -76,7 +76,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
     Counts expected{};
     CHECK(this->counts == expected);
     {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
@@ -101,7 +101,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
   {
     Counts expected{};
     {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
@@ -130,7 +130,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
     {
       bytes(42 * sizeof(int));
       cudax::uninitialized_buffer buffer{
-        cudax::mr::shared_resource(42, this), 42};
+        cudax::shared_resource(42, this), 42};
       ++expected.object_count;
       ++expected.allocate_count;
       CHECK(this->counts == expected);
diff --git a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h
deleted file mode 100644
index 72e01a5521d..00000000000
--- a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h
+++ /dev/null
@@ -1,219 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++, the C++ Standard Library for your entire system,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H
-#define _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H
-
-#include
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-# pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-# pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-# pragma system_header
-#endif // no system header
-
-#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
-
-# if defined(_CCCL_CUDA_COMPILER_CLANG)
-# include
-# endif // _CCCL_CUDA_COMPILER_CLANG
-
-# include
-# include
-# include
-# include
-# include
-# include
-# include
-# include
-
-# if _CCCL_STD_VER >= 2014
-
-_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR
-
-//! @brief device_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation.
-//! By default uses device 0 to allocate memory
-class device_memory_resource
-{
-private:
-  int __device_id_{0};
-
-public:
-  //!
@brief default constructs a device_memory_resource allocating memory on device 0 - _CCCL_HIDE_FROM_ABI device_memory_resource() = default; - - //! @brief default constructs a device_memory_resource allocating memory on device \p __device_id - //! @param __device_id The id of the device we are allocating memory on - constexpr device_memory_resource(const int __device_id) noexcept - : __device_id_(__device_id) - {} - - //! @brief Allocate device memory of size at least \p __bytes. - //! @param __bytes The size in bytes of the allocation. - //! @param __alignment The requested alignment of the allocation. - //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. - //! @return Pointer to the newly allocated memory - _CCCL_NODISCARD void* allocate(const size_t __bytes, const size_t __alignment = default_cuda_malloc_alignment) const - { - // We need to ensure that the provided alignment matches the minimal provided alignment - if (!__is_valid_alignment(__alignment)) - { - _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to device_memory_resource::allocate."); - } - - // We need to ensure that we allocate on the right device as `cudaMalloc` always uses the current device - __ensure_current_device __device_wrapper{__device_id_}; - - void* __ptr{nullptr}; - _CCCL_TRY_CUDA_API(::cudaMalloc, "Failed to allocate memory with cudaMalloc.", &__ptr, __bytes); - return __ptr; - } - - //! @brief Deallocate memory pointed to by \p __ptr. - //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate` - //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. - //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. - void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept - { - // We need to ensure that the provided alignment matches the minimal provided alignment - _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to device_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "device_memory_resource::deallocate failed", __ptr); - (void) __alignment; - } - - //! @brief Equality comparison with another \c device_memory_resource - //! @param __other The other \c device_memory_resource - //! @return true, if both resources hold the same device id - _CCCL_NODISCARD constexpr bool operator==(device_memory_resource const& __other) const noexcept - { - return __device_id_ == __other.__device_id_; - } -# if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c device_memory_resource - //! @param __other The other \c device_memory_resource - //! @return true, if both resources hold different device id's - _CCCL_NODISCARD constexpr bool operator!=(device_memory_resource const& __other) const noexcept - { - return __device_id_ != __other.__device_id_; - } -# endif // _CCCL_STD_VER <= 2017 - -# if _CCCL_STD_VER >= 2020 - //! @brief Equality comparison between a \c device_memory_resource and another resource - //! @param __rhs The resource to compare to - //! @return If the underlying types are equality comparable, returns the result of equality comparison of both - //! resources. Otherwise, returns false. 
- _CCCL_TEMPLATE(class _Resource) - _CCCL_REQUIRES((__different_resource) ) - _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept - { - if constexpr (has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else - { - return false; - } - } -# else // ^^^ C++20 ^^^ / vvv C++17 - template - _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const&, _Resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const&, device_memory_resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - != resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const&, _Resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return true; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - != resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const&, device_memory_resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return true; - } -# endif // _CCCL_STD_VER <= 2017 - - //! @brief Enables the \c device_accessible property - friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {} - - //! 
@brief Checks whether the passed in alignment is valid - static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept - { - return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); - } -}; -static_assert(resource_with, ""); - -// For backward compatability -using cuda_memory_resource _LIBCUDACXX_DEPRECATED = device_memory_resource; - -_LIBCUDACXX_END_NAMESPACE_CUDA_MR - -# endif // _CCCL_STD_VER >= 2014 - -#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE - -#endif // _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h deleted file mode 100644 index 86835aede18..00000000000 --- a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h +++ /dev/null @@ -1,201 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H -#define _CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) - -# if defined(_CCCL_CUDA_COMPILER_CLANG) -# include -# endif // _CCCL_CUDA_COMPILER_CLANG - -# include -# include -# include -# include -# include -# include -# include - -# if _CCCL_STD_VER >= 2014 - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR - -//! @brief \c managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. -class managed_memory_resource -{ -private: - unsigned int __flags_ = cudaMemAttachGlobal; - - static constexpr unsigned int __available_flags = cudaMemAttachGlobal | cudaMemAttachHost; - -public: - constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept - : __flags_(__flags & __available_flags) - { - _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource"); - } - - //! @brief Allocate CUDA unified memory of size at least \p __bytes. - //! @param __bytes The size in bytes of the allocation. - //! @param __alignment The requested alignment of the allocation. - //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. - //! 
@return Pointer to the newly allocated memory - _CCCL_NODISCARD void* allocate(const size_t __bytes, const size_t __alignment = default_cuda_malloc_alignment) const - { - // We need to ensure that the provided alignment matches the minimal provided alignment - if (!__is_valid_alignment(__alignment)) - { - _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to managed_memory_resource::allocate."); - } - - void* __ptr{nullptr}; - _CCCL_TRY_CUDA_API( - ::cudaMallocManaged, "Failed to allocate memory with cudaMallocManaged.", &__ptr, __bytes, __flags_); - return __ptr; - } - - //! @brief Deallocate memory pointed to by \p __ptr. - //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. - //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. - //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. - void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept - { - // We need to ensure that the provided alignment matches the minimal provided alignment - _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to managed_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr); - (void) __alignment; - } - - //! @brief Equality comparison with another \c managed_memory_resource. - //! @param __other The other \c managed_memory_resource. - //! @return Whether both \c managed_memory_resource were constructed with the same flags. - _CCCL_NODISCARD constexpr bool operator==(managed_memory_resource const& __other) const noexcept - { - return __flags_ == __other.__flags_; - } -# if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c managed_memory_resource. - //! @param __other The other \c managed_memory_resource. - //! @return Whether both \c managed_memory_resource were constructed with different flags. - _CCCL_NODISCARD constexpr bool operator!=(managed_memory_resource const& __other) const noexcept - { - return __flags_ != __other.__flags_; - } -# endif // _CCCL_STD_VER <= 2017 - -# if _CCCL_STD_VER >= 2020 - //! @brief Equality comparison between a \c managed_memory_resource and another resource - //! @param __rhs The resource to compare to - //! @return If the underlying types are equality comparable, returns the result of equality comparison of both - //! resources. Otherwise, returns false. 
- _CCCL_TEMPLATE(class _Resource) - _CCCL_REQUIRES(__different_resource) - _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept - { - if constexpr (has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else if constexpr (has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else - { - return false; - } - } -# else // ^^^ C++20 ^^^ / vvv C++17 - template - _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, managed_memory_resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return __rhs == __lhs; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__lhs == __rhs); - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__rhs == __lhs); - } -# endif // _CCCL_STD_VER <= 2017 - - //! @brief Enables the \c device_accessible property - friend constexpr void get_property(managed_memory_resource const&, device_accessible) noexcept {} - //! @brief Enables the \c host_accessible property - friend constexpr void get_property(managed_memory_resource const&, host_accessible) noexcept {} - - //! 
@brief Checks whether the passed in alignment is valid - static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept - { - return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); - } -}; -static_assert(resource_with, ""); -static_assert(resource_with, ""); - -// For backward compatability -using cuda_managed_memory_resource _LIBCUDACXX_DEPRECATED = managed_memory_resource; - -_LIBCUDACXX_END_NAMESPACE_CUDA_MR - -# endif // _CCCL_STD_VER >= 2014 - -#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE - -#endif //_CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h deleted file mode 100644 index 819d485a104..00000000000 --- a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h +++ /dev/null @@ -1,204 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H -#define _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) - -# if defined(_CCCL_CUDA_COMPILER_CLANG) -# include -# include -# endif // _CCCL_CUDA_COMPILER_CLANG - -# include -# include -# include -# include -# include -# include -# include - -# if _CCCL_STD_VER >= 2014 - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR - -//! @brief pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. -class pinned_memory_resource -{ -private: - unsigned int __flags_ = cudaHostAllocDefault; - - static constexpr unsigned int __available_flags = - cudaHostAllocDefault | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined; - -public: - constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept - : __flags_(__flags & __available_flags) - { - _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource"); - } - - //! @brief Allocate host memory of size at least \p __bytes. - //! @param __bytes The size in bytes of the allocation. - //! @param __alignment The requested alignment of the allocation. - //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. - //! 
@return Pointer to the newly allocated memory - _CCCL_NODISCARD void* allocate(const size_t __bytes, - const size_t __alignment = default_cuda_malloc_host_alignment) const - { - // We need to ensure that the provided alignment matches the minimal provided alignment - if (!__is_valid_alignment(__alignment)) - { - _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to pinned_memory_resource::allocate."); - } - - void* __ptr{nullptr}; - _CCCL_TRY_CUDA_API(::cudaMallocHost, "Failed to allocate memory with cudaMallocHost.", &__ptr, __bytes, __flags_); - return __ptr; - } - - //! @brief Deallocate memory pointed to by \p __ptr. - //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. - //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. - //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. - void - deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_host_alignment) const noexcept - { - // We need to ensure that the provided alignment matches the minimal provided alignment - _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to pinned_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr); - (void) __alignment; - } - - //! @brief Equality comparison with another \c pinned_memory_resource. - //! @param __other The other \c pinned_memory_resource. - //! @return Whether both \c pinned_memory_resource were constructed with the same flags. - _CCCL_NODISCARD constexpr bool operator==(pinned_memory_resource const& __other) const noexcept - { - return __flags_ == __other.__flags_; - } -# if _CCCL_STD_VER <= 2017 - //! @brief Equality comparison with another \c pinned_memory_resource. - //! @param __other The other \c pinned_memory_resource. - //! @return Whether both \c pinned_memory_resource were constructed with different flags. - _CCCL_NODISCARD constexpr bool operator!=(pinned_memory_resource const& __other) const noexcept - { - return __flags_ != __other.__flags_; - } -# endif // _CCCL_STD_VER <= 2017 - -# if _CCCL_STD_VER >= 2020 - //! @brief Equality comparison between a \c pinned_memory_resource and another resource - //! @param __rhs The resource to compare to - //! @return If the underlying types are equality comparable, returns the result of equality comparison of both - //! resources. Otherwise, returns false. 
- _CCCL_TEMPLATE(class _Resource) - _CCCL_REQUIRES(__different_resource) - _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept - { - if constexpr (has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else if constexpr (has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else - { - return false; - } - } -# else // ^^^ C++20 ^^^ / vvv C++17 - template - _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, pinned_memory_resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return __rhs == __lhs; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__lhs == __rhs); - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__rhs == __lhs); - } -# endif // _CCCL_STD_VER <= 2017 - - //! @brief Enables the \c device_accessible property - friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} - //! @brief Enables the \c host_accessible property - friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} - - //! @brief Checks whether the passed in alignment is valid - static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept - { - return __alignment <= default_cuda_malloc_host_alignment && (default_cuda_malloc_host_alignment % __alignment == 0); - } -}; -static_assert(resource_with, ""); -static_assert(resource_with, ""); - -// For backward compatability -using cuda_pinned_memory_resource _LIBCUDACXX_DEPRECATED = pinned_memory_resource; - -_LIBCUDACXX_END_NAMESPACE_CUDA_MR - -# endif // _CCCL_STD_VER >= 2014 - -#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE - -#endif //_CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index d2e4296b749..e1c0ac468c1 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -32,10 +32,7 @@ //! 
//!@endrst -#include #include -#include -#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp deleted file mode 100644 index fe983aa93de..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp +++ /dev/null @@ -1,95 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -#include "test_macros.h" - -void ensure_device_ptr(void* ptr) -{ - assert(ptr != nullptr); - cudaPointerAttributes attributes; - cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); - assert(status == cudaSuccess); - assert(attributes.type == cudaMemoryTypeDevice); -} - -void test() -{ - cuda::mr::device_memory_resource res{}; - - { // allocate / deallocate - auto* ptr = res.allocate(42); - static_assert(cuda::std::is_same::value, ""); - ensure_device_ptr(ptr); - - res.deallocate(ptr, 42); - } - - { // allocate / deallocate with alignment - constexpr size_t desired_alignment = 64; - auto* ptr = res.allocate(42, desired_alignment); - static_assert(cuda::std::is_same::value, ""); - ensure_device_ptr(ptr); - - // also check the alignment - const auto address = reinterpret_cast(ptr); - const auto alignment = address & (~address + 1ULL); - assert(alignment >= desired_alignment); - res.deallocate(ptr, 42, desired_alignment); - } - -#ifndef TEST_HAS_NO_EXCEPTIONS - { // allocate with too small alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 42); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } - - { // allocate with non matching alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 1337); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } -#endif // TEST_HAS_NO_EXCEPTIONS -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp deleted file mode 100644 index 56be1650df5..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -enum class AccessibilityType -{ - Device, - Host, -}; - -template -struct resource -{ - void* allocate(size_t, size_t) - { - return nullptr; - } - void deallocate(void*, size_t, size_t) noexcept {} - - bool operator==(const resource&) const - { - return true; - } - bool operator!=(const resource& other) const - { - return false; - } - - template = 0> - friend void get_property(const resource&, cuda::mr::device_accessible) noexcept - {} - template = 0> - friend void get_property(const resource&, cuda::mr::host_accessible) noexcept - {} -}; -static_assert(cuda::mr::resource>, ""); -static_assert(!cuda::mr::resource_with, cuda::mr::device_accessible>, ""); -static_assert(cuda::mr::resource>, ""); -static_assert(cuda::mr::resource_with, cuda::mr::device_accessible>, ""); - -template -struct async_resource : public resource -{ - void* allocate_async(size_t, size_t, cuda::stream_ref) - { - return nullptr; - } - void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} -}; -static_assert(cuda::mr::async_resource>, ""); -static_assert(!cuda::mr::async_resource_with, cuda::mr::device_accessible>, ""); -static_assert(cuda::mr::async_resource>, ""); -static_assert(cuda::mr::async_resource_with, cuda::mr::device_accessible>, - ""); - -// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_resource : cuda::mr::device_memory_resource -{ - using cuda::mr::device_memory_resource::device_memory_resource; -}; -static_assert(cuda::mr::resource, ""); - -// Ensure that we can only - -void test() -{ - cuda::mr::device_memory_resource first{}; - { // comparison against a plain device_memory_resource - cuda::mr::device_memory_resource second{}; - assert(first == second); - assert(!(first != second)); - } - - { // comparison against a device_memory_resource wrapped inside a resource_ref - cuda::mr::device_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a device_memory_resource wrapped inside a resource_ref - cuda::mr::device_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a different resource - resource host_resource{}; - resource device_resource{}; - assert(!(first == host_resource)); - assert(first != host_resource); - assert(!(first == device_resource)); - assert(first != device_resource); - - assert(!(host_resource == first)); - assert(host_resource != first); - assert(!(device_resource == first)); - assert(device_resource != first); - } - - { // comparison against a different resource through resource_ref - async_resource host_async_resource{}; - async_resource device_async_resource{}; - cuda::mr::resource_ref host_ref{host_async_resource}; - cuda::mr::resource_ref device_ref{device_async_resource}; - assert(!(first == host_ref)); - assert(first != host_ref); - assert(!(first == device_async_resource)); - assert(first != device_async_resource); - - assert(!(host_ref == first)); - assert(host_ref != first); - assert(!(device_async_resource == first)); - assert(device_async_resource != first); - } -} - 
-int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp deleted file mode 100644 index d642b83bf02..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include - -using resource = cuda::mr::device_memory_resource; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_constructible::value, ""); -static_assert(cuda::std::is_trivially_move_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_assignable::value, ""); -static_assert(cuda::std::is_trivially_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); - -int main(int, char**) -{ - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp deleted file mode 100644 index f32093a1582..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -#include "test_macros.h" - -void ensure_managed_ptr(void* ptr) -{ - assert(ptr != nullptr); - cudaPointerAttributes attributes; - cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); - assert(status == cudaSuccess); - assert(attributes.type == cudaMemoryTypeManaged); -} - -void test(const unsigned int flag) -{ - cuda::mr::managed_memory_resource res{flag}; - - { // allocate / deallocate - auto* ptr = res.allocate(42); - static_assert(cuda::std::is_same::value, ""); - ensure_managed_ptr(ptr); - - res.deallocate(ptr, 42); - } - - { // allocate / deallocate with alignment - auto* ptr = res.allocate(42, 4); - static_assert(cuda::std::is_same::value, ""); - ensure_managed_ptr(ptr); - - res.deallocate(ptr, 42, 4); - } - -#ifndef TEST_HAS_NO_EXCEPTIONS - { // allocate with too small alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 42); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } - - { // allocate with non matching alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 1337); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } -#endif // TEST_HAS_NO_EXCEPTIONS -} - -void test() -{ - test(cudaMemAttachGlobal); - test(cudaMemAttachHost); -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp deleted file mode 100644 index 2c42c469b4b..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -enum class AccessibilityType -{ - Device, - Host, -}; - -template -struct resource -{ - void* allocate(size_t, size_t) - { - return nullptr; - } - void deallocate(void*, size_t, size_t) noexcept {} - - bool operator==(const resource&) const - { - return true; - } - bool operator!=(const resource& other) const - { - return false; - } -}; -static_assert(cuda::mr::resource>, ""); -static_assert(cuda::mr::resource>, ""); - -template -struct async_resource : public resource -{ - void* allocate_async(size_t, size_t, cuda::stream_ref) - { - return nullptr; - } - void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} -}; -static_assert(cuda::mr::async_resource>, ""); -static_assert(cuda::mr::async_resource>, ""); - -// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_managed_resource : cuda::mr::managed_memory_resource -{ - using cuda::mr::managed_memory_resource::managed_memory_resource; -}; -static_assert(cuda::mr::resource, ""); - -void test() -{ - cuda::mr::managed_memory_resource first{}; - { // comparison against a plain managed_memory_resource - cuda::mr::managed_memory_resource second{}; - assert(first == second); - assert(!(first != second)); - } - - { // comparison against a plain managed_memory_resource with a different flag set - cuda::mr::managed_memory_resource second{cudaMemAttachHost}; - assert(!(first == second)); - assert((first != second)); - } - - { // comparison against a managed_memory_resource wrapped inside a resource_ref - cuda::mr::managed_memory_resource second{}; - assert(first == cuda::mr::resource_ref{second}); - assert(!(first != cuda::mr::resource_ref{second})); - assert(cuda::mr::resource_ref{second} == first); - assert(!(cuda::mr::resource_ref{second} != first)); - } - - { // comparison against a managed_memory_resource wrapped inside a resource_ref - cuda::mr::managed_memory_resource second{}; - assert(first == cuda::mr::resource_ref{second}); - assert(!(first != cuda::mr::resource_ref{second})); - assert(cuda::mr::resource_ref{second} == first); - assert(!(cuda::mr::resource_ref{second} != first)); - } - - { // comparison against a different resource through resource_ref - resource host_resource{}; - resource device_resource{}; - assert(!(first == host_resource)); - assert(first != host_resource); - assert(!(first == device_resource)); - assert(first != device_resource); - - assert(!(host_resource == first)); - assert(host_resource != first); - assert(!(device_resource == first)); - assert(device_resource != first); - } - - { // comparison against a different resource through resource_ref - async_resource host_async_resource{}; - async_resource device_async_resource{}; - assert(!(first == host_async_resource)); - assert(first != host_async_resource); - assert(!(first == device_async_resource)); - assert(first != device_async_resource); - - assert(!(host_async_resource == first)); - assert(host_async_resource != first); - assert(!(device_async_resource == first)); - assert(device_async_resource != first); - } -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp deleted file mode 100644 index 
02b9bd0294c..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include - -using resource = cuda::mr::managed_memory_resource; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_constructible::value, ""); -static_assert(cuda::std::is_trivially_move_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_assignable::value, ""); -static_assert(cuda::std::is_trivially_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); - -int main(int, char**) -{ - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp deleted file mode 100644 index a8fff25ffa6..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp +++ /dev/null @@ -1,98 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -#include "test_macros.h" - -void ensure_pinned_host_ptr(void* ptr) -{ - assert(ptr != nullptr); - cudaPointerAttributes attributes; - cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); - assert(status == cudaSuccess); - assert((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != nullptr)); -} - -void test(const unsigned int flag) -{ - cuda::mr::pinned_memory_resource res{flag}; - - { // allocate / deallocate - auto* ptr = res.allocate(42); - static_assert(cuda::std::is_same::value, ""); - ensure_pinned_host_ptr(ptr); - - res.deallocate(ptr, 42); - } - - { // allocate / deallocate with alignment - auto* ptr = res.allocate(42, 4); - static_assert(cuda::std::is_same::value, ""); - ensure_pinned_host_ptr(ptr); - - res.deallocate(ptr, 42, 4); - } - -#ifndef TEST_HAS_NO_EXCEPTIONS - { // allocate with too small alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 42); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } - - { // allocate with non matching alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 1337); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } -#endif // TEST_HAS_NO_EXCEPTIONS -} - -void test() -{ - test(cudaHostAllocDefault); - test(cudaHostAllocPortable); - test(cudaHostAllocMapped); - test(cudaHostAllocWriteCombined); -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp deleted file mode 100644 index e7f9918895d..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp +++ /dev/null @@ -1,132 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -enum class AccessibilityType -{ - Device, - Host, -}; - -template -struct resource -{ - void* allocate(size_t, size_t) - { - return nullptr; - } - void deallocate(void*, size_t, size_t) noexcept {} - - bool operator==(const resource&) const - { - return true; - } - bool operator!=(const resource& other) const - { - return false; - } -}; -static_assert(cuda::mr::resource>, ""); -static_assert(cuda::mr::resource>, ""); - -template -struct async_resource : public resource -{ - void* allocate_async(size_t, size_t, cuda::stream_ref) - { - return nullptr; - } - void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} -}; -static_assert(cuda::mr::async_resource>, ""); -static_assert(cuda::mr::async_resource>, ""); - -// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_pinned_resource : cuda::mr::pinned_memory_resource -{ - using cuda::mr::pinned_memory_resource::pinned_memory_resource; -}; -static_assert(cuda::mr::resource, ""); - -void test() -{ - cuda::mr::pinned_memory_resource first{}; - { // comparison against a plain pinned_memory_resource - cuda::mr::pinned_memory_resource second{cudaHostAllocDefault}; - assert(first == second); - assert(!(first != second)); - } - - { // comparison against a plain pinned_memory_resource with a different flag set - cuda::mr::pinned_memory_resource second{cudaHostAllocPortable}; - assert(!(first == second)); - assert((first != second)); - } - - { // comparison against a pinned_memory_resource wrapped inside a resource_ref - cuda::mr::pinned_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a pinned_memory_resource wrapped inside a resource_ref - cuda::mr::pinned_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a different resource through resource_ref - resource host_resource{}; - resource device_resource{}; - assert(!(first == host_resource)); - assert(first != host_resource); - assert(!(first == device_resource)); - assert(first != device_resource); - - assert(!(host_resource == first)); - assert(host_resource != first); - assert(!(device_resource == first)); - assert(device_resource != first); - } - - { // comparison against a different resource through resource_ref - async_resource host_async_resource{}; - async_resource device_async_resource{}; - assert(!(first == host_async_resource)); - assert(first != host_async_resource); - assert(!(first == device_async_resource)); - assert(first != device_async_resource); - - assert(!(host_async_resource == first)); - assert(host_async_resource != first); - assert(!(device_async_resource == first)); - assert(device_async_resource != first); - } -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp deleted file mode 100644 index b0bbae95268..00000000000 --- 
a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include - -using resource = cuda::mr::pinned_memory_resource; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_constructible::value, ""); -static_assert(cuda::std::is_trivially_move_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_assignable::value, ""); -static_assert(cuda::std::is_trivially_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); - -int main(int, char**) -{ - return 0; -} From c7ed9749f052fc1c8aaf825af2f7d6447c479b17 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Tue, 26 Nov 2024 01:32:34 -0800 Subject: [PATCH 31/45] `std::dims` (#2961) --- docs/libcudacxx/standard_api.rst | 6 ++-- .../standard_api/container_library/mdspan.rst | 1 + .../include/cuda/std/__mdspan/extents.h | 3 ++ .../mdspan.extents.dims/compare.pass.cpp | 29 +++++++++++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp diff --git a/docs/libcudacxx/standard_api.rst b/docs/libcudacxx/standard_api.rst index 0729df55406..cb01d478702 100644 --- a/docs/libcudacxx/standard_api.rst +++ b/docs/libcudacxx/standard_api.rst @@ -101,5 +101,7 @@ Feature availability: - C++23 ```` is available in C++17. - - mdspan is feature complete in C++17 onwards. - - mdspan on msvc is only supported in C++20 and onwards. + - ``mdspan`` is feature complete in C++17 onwards. + - ``mdspan`` on msvc is only supported in C++20 and onwards. + +- C++26 ``std::dims`` is available in C++17. 
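A minimal usage sketch of the new alias (not part of the patch; it assumes C++17, that `<cuda/std/mdspan>`, `<cuda/std/cstddef>` and `<cuda/std/type_traits>` are the right headers, and that the index type defaults to `size_t` as in the C++26 `std::dims`):

  #include <cuda/std/cstddef>
  #include <cuda/std/mdspan>
  #include <cuda/std/type_traits>

  // dims<Rank> names a rank-Rank extents type with every extent dynamic,
  // i.e. the same type as dextents<size_t, Rank>.
  using ext2d_t = cuda::std::dims<2>;
  static_assert(cuda::std::is_same<ext2d_t, cuda::std::dextents<cuda::std::size_t, 2>>::value, "");

  // A 3 x 4 dynamic-extent object, usable wherever dextents is used today.
  ext2d_t extents{3, 4};

The conformance test added below (compare.pass.cpp) is the authoritative example.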
diff --git a/docs/libcudacxx/standard_api/container_library/mdspan.rst b/docs/libcudacxx/standard_api/container_library/mdspan.rst
index 664a60eb48e..72174d13624 100644
--- a/docs/libcudacxx/standard_api/container_library/mdspan.rst
+++ b/docs/libcudacxx/standard_api/container_library/mdspan.rst
@@ -7,6 +7,7 @@ Extensions
 ----------
 
 - All features of ``<mdspan>`` are made available in C++17 onwards
+- C++26 ``std::dims`` is made available in C++17 onwards
 
 Restrictions
 ------------
diff --git a/libcudacxx/include/cuda/std/__mdspan/extents.h b/libcudacxx/include/cuda/std/__mdspan/extents.h
index d0bdfd016f6..0acb6579d7d 100644
--- a/libcudacxx/include/cuda/std/__mdspan/extents.h
+++ b/libcudacxx/include/cuda/std/__mdspan/extents.h
@@ -523,6 +523,9 @@ struct __make_dextents<_IndexType, 0, _CUDA_VSTD::extents<_IndexType, _ExtentsPa
 template <class _IndexType, size_t _Rank>
 using dextents = typename __detail::__make_dextents<_IndexType, _Rank>::type;
 
+template <size_t _Rank, class _IndexType = size_t>
+using dims = dextents<_IndexType, _Rank>;
+
 # if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION)
 template <class... _IndexTypes>
 _CCCL_HOST_DEVICE extents(_IndexTypes...)
diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp
new file mode 100644
index 00000000000..ec2e8c6d725
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11 +// UNSUPPORTED: msvc && c++14, msvc && c++17 + +#include +#include + +int main(int, char**) +{ + { + using index_t = size_t; + + cuda::std::dextents e0{1, 2, 3}; + cuda::std::dims<3> e1{1, 2, 3}; + + static_assert(cuda::std::is_same::value, ""); + assert(e0 == e1); + } + + return 0; +} From 8d6986d46ca5288d4bd7af7b9088f8a55297ba93 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 11:29:19 +0100 Subject: [PATCH 32/45] Fix merge conflict from moving resources up a namespace (#2965) --- cudax/test/algorithm/common.cuh | 2 +- cudax/test/algorithm/copy.cu | 2 +- cudax/test/algorithm/fill.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index c4c7be0d02c..661d087f3bc 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -46,7 +46,7 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p template auto make_buffer_for_mdspan(Extents extents, char value = 0) { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; auto mapping = typename Layout::template mapping{extents}; cudax::uninitialized_buffer buffer(host_resource, mapping.required_span_size()); diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu index afb9a2b71d5..583c3a836aa 100644 --- a/cudax/test/algorithm/copy.cu +++ b/cudax/test/algorithm/copy.cu @@ -160,7 +160,7 @@ TEST_CASE("Mdspan copy", "[data_manipulation]") auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1); cuda::std::mdspan mdspan(mdspan_buffer.data(), mixed_extents); cudax::weird_buffer> buffer{ - cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()}; + cudax::pinned_memory_resource{}, mdspan.mapping().required_span_size()}; cudax::copy_bytes(stream, mdspan, buffer); stream.wait(); diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu index 35fae342ad3..80bf6ef57e6 100644 --- a/cudax/test/algorithm/fill.cu +++ b/cudax/test/algorithm/fill.cu @@ -67,7 +67,7 @@ TEST_CASE("Mdspan Fill", "[data_manipulation]") { using static_extents = cuda::std::extents; auto size = cuda::std::layout_left::mapping().required_span_size(); - cudax::weird_buffer> buffer(cuda::mr::pinned_memory_resource{}, size); + cudax::weird_buffer> buffer(cudax::pinned_memory_resource{}, size); cudax::fill_bytes(stream, buffer, fill_byte); check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size)); From 3e826380f2c42162d529dc67944ad8e8435d4d18 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Tue, 26 Nov 2024 15:45:03 -0800 Subject: [PATCH 33/45] [CUDAX] Add a way to combine thread hierarchies (#2746) * Implement hierarchy_dimensions::combine * Fix issues after the merge --- .../__hierarchy/hierarchy_dimensions.cuh | 66 ++++++++++++++++++- .../__hierarchy/hierarchy_levels.cuh | 11 ++-- cudax/test/hierarchy/hierarchy_smoke.cu | 51 ++++++++++++++ 3 files changed, 120 insertions(+), 8 deletions(-) diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh index a458c0d4017..61ddc5cb203 100644 --- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh @@ -138,7 +138,7 @@ template struct can_stack_checker { template - using 
can_stack = ::cuda::std::__fold_and...>; + using can_stack = ::cuda::std::__fold_and...>; }; template @@ -436,6 +436,9 @@ private: }; public: + template + friend struct hierarchy_dimensions_fragment; + template using extents_type = decltype(::cuda::std::apply( ::cuda::std::declval>(), @@ -715,6 +718,63 @@ public: return ::cuda::std::apply(detail::get_level_helper{}, levels); } + + //! @brief Returns a new hierarchy with combined levels of this and the other supplied hierarchy + //! + //! This function combines this hierarchy with the supplied hierarchy, the resulting hierarchy + //! holds levels present in both hierarchies. In case of overlap of levels this hierarchy + //! is prioritized, so the result always holds all levels from this hierarchy and non-overlapping + //! levels from the other hierarchy. + //! + //! @param other The other hierarchy to be combined with this hierarchy + //! + //! @return Hierarchy holding the combined levels from both hierarchies + template + constexpr auto combine(const hierarchy_dimensions_fragment& other) + { + using this_top_level = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>; + using this_bottom_level = __level_type_of<::cuda::std::__type_index_c>; + using other_top_level = __level_type_of<::cuda::std::__type_index_c<0, OtherLevels...>>; + using other_bottom_level = __level_type_of<::cuda::std::__type_index_c>; + if constexpr (detail::can_rhs_stack_on_lhs) + { + // Easily stackable case, example this is (grid), other is (cluster, block) + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(levels, other.levels)); + } + else if constexpr (has_level> + && (!has_level> + || ::cuda::std::is_same_v) ) + { + // Overlap with this on the top, e.g. this is (grid, cluster), other is (cluster, block), can fully overlap + // Do we have some CCCL tuple utils that can select all but the first? + auto to_add_with_one_too_many = other.template levels_range(); + auto to_add = ::cuda::std::apply( + [](auto&&, auto&&... rest) { + return ::cuda::std::make_tuple(rest...); + }, + to_add_with_one_too_many); + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(levels, to_add)); + } + else + { + if constexpr (detail::can_rhs_stack_on_lhs) + { + // Easily stackable case again, just reversed + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(other.levels, levels)); + } + else + { + // Overlap with this on the bottom, e.g. 
this is (cluster, block), other is (grid, cluster), can fully overlap + static_assert(has_level> + && (!has_level> + || ::cuda::std::is_same_v), + "Can't combine the hierarchies"); + + auto to_add = other.template levels_range(); + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(to_add, levels)); + } + } + } }; /** @@ -810,14 +870,14 @@ _CUDAX_API constexpr auto operator&(const hierarchy_dimensions_fragment>; using bottom_level = __level_type_of<::cuda::std::__type_index_c>; - if constexpr (detail::can_stack_on_top>) + if constexpr (detail::can_rhs_stack_on_lhs>) { return hierarchy_dimensions_fragment( ::cuda::std::tuple_cat(::cuda::std::make_tuple(new_level), ls.levels)); } else { - static_assert(detail::can_stack_on_top<__level_type_of, bottom_level>, + static_assert(detail::can_rhs_stack_on_lhs<__level_type_of, bottom_level>, "Not supported order of levels in hierarchy"); using NewUnit = detail::__default_unit_below<__level_type_of>; return hierarchy_dimensions_fragment( diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh index 23593866c75..bbdcdcfc77e 100644 --- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh @@ -68,6 +68,7 @@ struct dimensions_query return hierarchy::extents(); } }; + } // namespace detail // Struct to represent levels allowed below or above a certain level, @@ -91,12 +92,12 @@ _CCCL_INLINE_VAR constexpr bool is_level_allowed...>; template -_CCCL_INLINE_VAR constexpr bool can_stack_on_top = +_CCCL_INLINE_VAR constexpr bool can_rhs_stack_on_lhs = is_level_allowed || is_level_allowed; template _CCCL_INLINE_VAR constexpr bool legal_unit_for_level = - can_stack_on_top || legal_unit_for_level>; + can_rhs_stack_on_lhs || legal_unit_for_level>; template _CCCL_INLINE_VAR constexpr bool legal_unit_for_level = false; @@ -275,7 +276,7 @@ struct dims_helper template /* _CCCL_NODISCARD */ _CCCL_DEVICE auto extents_impl() { - if constexpr (::cuda::std::is_same_v || can_stack_on_top) + if constexpr (::cuda::std::is_same_v || can_rhs_stack_on_lhs) { return dim3_to_dims(dims_helper::extents()); } @@ -291,7 +292,7 @@ template template /* _CCCL_NODISCARD */ _CCCL_DEVICE auto index_impl() { - if constexpr (::cuda::std::is_same_v || detail::can_stack_on_top) + if constexpr (::cuda::std::is_same_v || detail::can_rhs_stack_on_lhs) { return dim3_to_dims(dims_helper::index()); } @@ -386,7 +387,7 @@ template _CCCL_DEVICE auto rank(const Unit&, const Level&) { static_assert(detail::legal_unit_for_level); - if constexpr (detail::can_stack_on_top) + if constexpr (detail::can_rhs_stack_on_lhs) { return detail::index_to_linear( detail::index_impl(), detail::extents_impl()); diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index 582e745ce3c..206c71d45bb 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -526,3 +526,54 @@ TEST_CASE("cudax::distribute", "[hierarchy]") CUDAX_REQUIRE(dims.count(cudax::thread, cudax::block) == 256); CUDAX_REQUIRE(dims.count(cudax::block, cudax::grid) == (numElements + threadsPerBlock - 1) / threadsPerBlock); } + +TEST_CASE("hierarchy merge", "[hierarchy]") +{ + SECTION("Non overlapping") + { + auto h1 = cudax::make_hierarchy_fragment(cudax::grid_dims<2>()); + auto h2 = cudax::make_hierarchy_fragment(cudax::block_dims<3>()); + auto combined = h1.combine(h2); + 
static_assert(combined.count(cudax::thread) == 6); + static_assert(combined.count(cudax::thread, cudax::block) == 3); + static_assert(combined.count(cudax::block) == 2); + auto combined_the_other_way = h2.combine(h1); + static_assert(cuda::std::is_same_v); + static_assert(combined_the_other_way.count(cudax::thread) == 6); + + auto dynamic_values = cudax::cluster_dims(4) & cudax::block_dims(5); + auto combined_dynamic = dynamic_values.combine(h1); + CUDAX_REQUIRE(combined_dynamic.count(cudax::thread) == 40); + } + SECTION("Overlapping") + { + auto h1 = cudax::make_hierarchy_fragment(cudax::grid_dims<2>(), cudax::cluster_dims<3>()); + auto h2 = cudax::make_hierarchy_fragment(cudax::block_dims<4>(), cudax::cluster_dims<5>()); + auto combined = h1.combine(h2); + static_assert(combined.count(cudax::thread) == 24); + static_assert(combined.count(cudax::thread, cudax::block) == 4); + static_assert(combined.count(cudax::block) == 6); + + auto combined_the_other_way = h2.combine(h1); + static_assert(!cuda::std::is_same_v); + static_assert(combined_the_other_way.count(cudax::thread) == 40); + static_assert(combined_the_other_way.count(cudax::thread, cudax::block) == 4); + static_assert(combined_the_other_way.count(cudax::block) == 10); + + auto ultimate_combination = combined.combine(combined_the_other_way); + static_assert(cuda::std::is_same_v); + static_assert(ultimate_combination.count(cudax::thread) == 24); + + auto block_level_replacement = cudax::make_hierarchy_fragment(cudax::block_dims<6>()); + auto with_block_replaced = block_level_replacement.combine(combined); + static_assert(with_block_replaced.count(cudax::thread) == 36); + static_assert(with_block_replaced.count(cudax::thread, cudax::block) == 6); + + auto grid_cluster_level_replacement = + cudax::make_hierarchy_fragment(cudax::grid_dims<7>(), cudax::cluster_dims<8>()); + auto with_grid_cluster_replaced = grid_cluster_level_replacement.combine(combined); + static_assert(with_grid_cluster_replaced.count(cudax::thread) == 7 * 8 * 4); + static_assert(with_grid_cluster_replaced.count(cudax::block, cudax::cluster) == 8); + static_assert(with_grid_cluster_replaced.count(cudax::cluster) == 7); + } +} From ab87e540f47abb5f85adc2edad12d00afbfc34d9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 26 Nov 2024 18:11:57 -0600 Subject: [PATCH 34/45] Require approval to run CI on draft PRs. 
(#2969) --- .github/copy-pr-bot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index d799c24aa69..f1297e5fb15 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -2,6 +2,7 @@ # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ enabled: true +auto_sync_draft: false additional_trustees: - ahendriksen - gonzalobg From 27d8c87eb887feb61b6aadd4557f0444a2681562 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:42:47 -0800 Subject: [PATCH 35/45] fix thread-reduce performance regression (#2944) --- cub/cub/thread/thread_operators.cuh | 17 ++++ cub/cub/thread/thread_reduce.cuh | 147 ++++++++++++++++++++++------ 2 files changed, 134 insertions(+), 30 deletions(-) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 05f2d6a41f6..2ba2f6e0c1b 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -702,6 +702,10 @@ struct CubOperatorToSimdOperator<::cuda::minimum<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::minimum, T> : CubOperatorToSimdOperator<::cuda::minimum<>, T> +{}; + template struct CubOperatorToSimdOperator<::cuda::maximum<>, T> { @@ -709,6 +713,10 @@ struct CubOperatorToSimdOperator<::cuda::maximum<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::maximum, T> : CubOperatorToSimdOperator<::cuda::maximum<>, T> +{}; + template struct CubOperatorToSimdOperator<::cuda::std::plus<>, T> { @@ -716,6 +724,10 @@ struct CubOperatorToSimdOperator<::cuda::std::plus<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::std::plus, T> : CubOperatorToSimdOperator<::cuda::std::plus<>, T> +{}; + template struct CubOperatorToSimdOperator<::cuda::std::multiplies<>, T> { @@ -723,6 +735,11 @@ struct CubOperatorToSimdOperator<::cuda::std::multiplies<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::std::multiplies, T> + : CubOperatorToSimdOperator<::cuda::std::multiplies<>, T> +{}; + template using cub_operator_to_simd_operator_t = typename CubOperatorToSimdOperator::type; diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index f384d907b34..ad8342d65a9 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -229,8 +229,10 @@ namespace internal template struct enable_generic_simd_reduction_traits { - static constexpr bool value = cub::detail::is_one_of() - && cub::detail::is_one_of, ::cuda::maximum<>>(); + static constexpr bool value = + cub::detail::is_one_of() + && cub::detail:: + is_one_of, ::cuda::minimum, ::cuda::maximum<>, ::cuda::maximum>(); }; # if defined(_CCCL_HAS_NVFP16) @@ -238,8 +240,16 @@ struct enable_generic_simd_reduction_traits template struct enable_generic_simd_reduction_traits<__half, ReductionOp> { - static constexpr bool value = cub::detail:: - is_one_of, ::cuda::maximum<>, ::cuda::std::plus<>, ::cuda::std::multiplies<>>(); + static constexpr bool value = cub::detail::is_one_of< + ReductionOp, + ::cuda::minimum<>, + ::cuda::minimum<__half>, + ::cuda::maximum<>, + ::cuda::maximum<__half>, + ::cuda::std::plus<>, + ::cuda::std::plus<__half>, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies<__half>>(); }; # endif // defined(_CCCL_HAS_NVFP16) @@ -248,8 +258,16 @@ struct 
enable_generic_simd_reduction_traits<__half, ReductionOp> template struct enable_generic_simd_reduction_traits<__nv_bfloat16, ReductionOp> { - static constexpr bool value = cub::detail:: - is_one_of, ::cuda::maximum<>, ::cuda::std::plus<>, ::cuda::std::multiplies<>>(); + static constexpr bool value = cub::detail::is_one_of< + ReductionOp, + ::cuda::minimum<>, + ::cuda::minimum<__nv_bfloat16>, + ::cuda::maximum<>, + ::cuda::maximum<__nv_bfloat16>, + ::cuda::std::plus<>, + ::cuda::std::plus<__nv_bfloat16>, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies<__nv_bfloat16>>(); }; # endif // defined(_CCCL_HAS_NVBF16) @@ -269,7 +287,8 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_sm90_simd_reduction() using cub::detail::is_one_of; // ::cuda::std::plus<> not handled: IADD3 always produces less instructions than VIADD2 return is_one_of() && // - is_one_of, ::cuda::maximum<>>() && Length >= 10; + is_one_of, ::cuda::minimum, ::cuda::maximum<>, ::cuda::maximum>() + && Length >= 10; } template @@ -277,7 +296,15 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_sm80_simd_reduction() { using cub::detail::is_one_of; using ::cuda::std::is_same; - return is_one_of, ::cuda::maximum<>, ::cuda::std::plus<>, ::cuda::std::multiplies<>>() + return is_one_of, + ::cuda::minimum, + ::cuda::maximum<>, + ::cuda::maximum, + ::cuda::std::plus<>, + ::cuda::std::plus, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies>() && Length >= 4 # if defined(_CCCL_HAS_NVFP16) && defined(_CCCL_HAS_NVBF16) && (is_same::value || is_same::value) @@ -295,7 +322,12 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_sm70_simd_reduction() using cub::detail::is_one_of; using ::cuda::std::is_same; # if defined(_CCCL_HAS_NVFP16) - return is_same::value && is_one_of, ::cuda::std::multiplies<>>() + return is_same::value + && is_one_of, + ::cuda::std::plus, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies>() && Length >= 4; # else return false; @@ -344,14 +376,21 @@ template struct enable_ternary_reduction_sm90 { static constexpr bool value = - cub::detail::is_one_of - && cub::detail::is_one_of, - ::cuda::maximum<>, - ::cuda::std::plus<>, - ::cuda::std::bit_and<>, - ::cuda::std::bit_or<>, - ::cuda::std::bit_xor<>>(); + cub::detail::is_one_of() + && cub::detail::is_one_of< + ReductionOp, + ::cuda::minimum<>, + ::cuda::minimum, + ::cuda::maximum<>, + ::cuda::maximum, + ::cuda::std::plus<>, + ::cuda::std::plus, + ::cuda::std::bit_and<>, + ::cuda::std::bit_and, + ::cuda::std::bit_or<>, + ::cuda::std::bit_or, + ::cuda::std::bit_xor<>, + ::cuda::std::bit_xor>(); }; # if defined(_CCCL_HAS_NVFP16) @@ -360,7 +399,13 @@ template struct enable_ternary_reduction_sm90<__half2, ReductionOp> { static constexpr bool value = - cub::detail::is_one_of, ::cuda::maximum<>, SimdMin<__half>, SimdMax<__half>>(); + cub::detail::is_one_of, + ::cuda::minimum<__half2>, + ::cuda::maximum<>, + ::cuda::maximum<__half2>, + SimdMin<__half>, + SimdMax<__half>>(); }; # endif // defined(_CCCL_HAS_NVFP16) @@ -370,8 +415,14 @@ struct enable_ternary_reduction_sm90<__half2, ReductionOp> template struct enable_ternary_reduction_sm90<__nv_bfloat162, ReductionOp> { - static constexpr bool value = cub::detail:: - is_one_of, ::cuda::maximum<>, SimdMin<__nv_bfloat16>, SimdMax<__nv_bfloat16>>(); + static constexpr bool value = + cub::detail::is_one_of, + ::cuda::minimum<__nv_bfloat162>, + ::cuda::maximum<>, + ::cuda::maximum<__nv_bfloat162>, + SimdMin<__nv_bfloat16>, + SimdMax<__nv_bfloat16>>(); }; # endif // defined(_CCCL_HAS_NVBF16) @@ -394,10 +445,11 @@ 
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE _CCCL_CONSTEXPR_CXX14 bool enable NV_PROVIDES_SM_90, (return enable_ternary_reduction_sm90::value;), NV_PROVIDES_SM_50, - (return is_one_of() - && is_one_of, ::cuda::std::bit_and<>, ::cuda::std::bit_or<>, - ::cuda::std::bit_xor<>>();), + (return is_one_of() + && is_one_of, ::cuda::std::plus, + ::cuda::std::bit_and<>, ::cuda::std::bit_and, + ::cuda::std::bit_or<>, ::cuda::std::bit_or, + ::cuda::std::bit_xor<>, ::cuda::std::bit_xor>();), NV_ANY_TARGET, (return false;) ); @@ -415,12 +467,19 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_promotion() return ::cuda::std::is_integral::value && sizeof(T) <= 2 && is_one_of, + ::cuda::std::plus, ::cuda::std::multiplies<>, + ::cuda::std::multiplies, ::cuda::std::bit_and<>, + ::cuda::std::bit_and, ::cuda::std::bit_or<>, + ::cuda::std::bit_or, ::cuda::std::bit_xor<>, + ::cuda::std::bit_xor, ::cuda::maximum<>, - ::cuda::minimum<>>(); + ::cuda::maximum, + ::cuda::minimum<>, + ::cuda::minimum>(); } /*********************************************************************************************************************** @@ -551,18 +610,46 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& using cub::internal::enable_simd_reduction; using cub::internal::enable_ternary_reduction; using PromT = ::cuda::std::_If(), int, AccumT>; + _CCCL_IF_CONSTEXPR (!cub::detail::is_one_of< + ReductionOp, + ::cuda::std::plus<>, + ::cuda::std::plus, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies, + ::cuda::std::bit_and<>, + ::cuda::std::bit_and, + ::cuda::std::bit_or<>, + ::cuda::std::bit_or, + ::cuda::std::bit_xor<>, + ::cuda::std::bit_xor, + ::cuda::maximum<>, + ::cuda::maximum, + ::cuda::minimum<>, + ::cuda::minimum, + cub::internal::SimdMin, + cub::internal::SimdMax>()) + { + return cub::internal::ThreadReduceSequential(input, reduction_op); + } + _CCCL_IF_CONSTEXPR (cub::detail::is_one_of, ::cuda::std::plus>() + && cub::detail::is_one_of()) + { + // clang-format off + NV_IF_TARGET(NV_PROVIDES_SM_90, + (return cub::internal::ThreadReduceSequential(input, reduction_op);), + (return cub::internal::ThreadReduceTernaryTree(input, reduction_op);) + ); + // clang-format on + } if (enable_simd_reduction()) { return cub::internal::ThreadReduceSimd(input, reduction_op); } - else if (enable_ternary_reduction()) + if (enable_ternary_reduction()) { return cub::internal::ThreadReduceTernaryTree(input, reduction_op); } - else - { - return cub::internal::ThreadReduceBinaryTree(input, reduction_op); - } + return cub::internal::ThreadReduceBinaryTree(input, reduction_op); } //! @brief Reduction over statically-sized array-like types, seeded with the specified @p prefix. 
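Context for the dispatch rework above, with a sketch that is not part of the patch: the hunks consistently add the typed functor spellings (e.g. `::cuda::maximum<T>`) next to the transparent ones (`::cuda::maximum<>`), presumably because the two are distinct types and previously only the transparent spelling was matched onto the fast SIMD / ternary-tree paths. The snippet below only illustrates that type distinction using the internal `cub::detail::is_one_of` helper seen in the hunks; the umbrella include is an assumption.

  #include <cub/cub.cuh>       // assumption: umbrella header pulls in the internal helper
  #include <cuda/functional>   // cuda::maximum

  // maximum<> and maximum<int> are different types, so a dispatch list must name
  // both for either spelling to take the optimized code path.
  static_assert(!cub::detail::is_one_of<cuda::maximum<int>, cuda::maximum<>>(), "");
  static_assert(cub::detail::is_one_of<cuda::maximum<int>, cuda::maximum<>, cuda::maximum<int>>(), "");
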
From 83aca35dde9ecc81044398a6d21a89d88a3f708f Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 26 Nov 2024 18:22:01 -0800 Subject: [PATCH 36/45] add a `__type_switch` utility and use it the ptx generator (#2946) --- docs/repo.toml | 5 +- .../functions/cuda_ptx_generated_helper.h | 92 +++++++------- libcudacxx/include/cuda/std/__cccl/dialect.h | 9 ++ .../cuda/std/__type_traits/type_list.h | 112 +++++++++++++++--- .../test/libcudacxx/cuda/type_list.pass.cpp | 20 ++++ 5 files changed, 168 insertions(+), 70 deletions(-) diff --git a/docs/repo.toml b/docs/repo.toml index f7c426f13db..ace31c74c71 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -149,6 +149,7 @@ doxygen_predefined = [ "_CCCL_FORCEINLINE", "_CCCL_STD_VER", "_CCCL_NODISCARD", + "_CCCL_NTTP_AUTO=auto", "_CCCL_VISIBILITY_HIDDEN", "_CCCL_SUPPRESS_DEPRECATED_PUSH", "_CCCL_SUPPRESS_DEPRECATED_POP", @@ -261,6 +262,7 @@ doxygen_predefined = [ "_CCCL_HOST=", "_CCCL_HOST_DEVICE=", "_CCCL_NODISCARD=[[nodiscard]]", + "_CCCL_NTTP_AUTO=auto", "_CCCL_STD_VER", "_CCCL_SUPPRESS_DEPRECATED_PUSH", "_CCCL_SUPPRESS_DEPRECATED_POP", @@ -408,6 +410,7 @@ doxygen_predefined = [ "_CCCL_CUDACC_AT_LEAST(x, y)=1", "_CCCL_CUDACC_BELOW(x, y)=0", "_CCCL_DEVICE=", + "_CCCL_DOXYGEN_INVOKED", "_CCCL_EAT_REST(x)=", "_CCCL_EXEC_CHECK_DISABLE=", "_CCCL_FORCEINLINE=", @@ -419,6 +422,7 @@ doxygen_predefined = [ "_CCCL_INLINE_VAR=inline", "_CCCL_NODISCARD=[[nodiscard]]", "_CCCL_NODISCARD_FRIEND=", + "_CCCL_NTTP_AUTO=auto", "_CCCL_STD_VER=2020", "_CCCL_TRAIT(x, y)=x::value", "_CUDA_VMR=cuda::mr", @@ -443,7 +447,6 @@ doxygen_predefined = [ "_CUDAX_TRIVIAL_DEVICE_API", "_CUDAX_PUBLIC_API", "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE=", - "_CCCL_DOXYGEN_INVOKED", ] # make sure to use ./fetch_imgs.sh diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index 2e2266ce979..cdb35957509 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -110,61 +111,50 @@ struct __atomic_longlong2 template using __atomic_cuda_deduce_bitwise = - _If, - _If, - _If, - _If, - __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>>; + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_case<8, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>; template -using __atomic_cuda_deduce_arithmetic = - _If<_CCCL_TRAIT(is_floating_point, _Type), - _If, - __atomic_cuda_operand_deduction>, - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>, // There is no - // atom.add.s64 - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>>>; +using __atomic_cuda_deduce_arithmetic = _If< + _CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>, // There is no + // atom.add.s64 + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>>>; 
template -using __atomic_cuda_deduce_minmax = - _If<_CCCL_TRAIT(is_floating_point, _Type), - _If, - __atomic_cuda_operand_deduction>, - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>, // atom.min|max.s64 - // supported - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>>>; +using __atomic_cuda_deduce_minmax = _If< + _CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>, // atom.min|max.s64 + // supported + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>>>; template using __atomic_enable_if_native_bitwise = bool; diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 407f2db6ecf..06387172b9b 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -105,6 +105,15 @@ # define _CCCL_NO_VARIABLE_TEMPLATES #endif // _CCCL_STD_VER <= 2011 +// Declaring a non-type template parameters with auto is only available from C++17 onwards +#if _CCCL_STD_VER >= 2017 && defined(__cpp_nontype_template_parameter_auto) \ + && (__cpp_nontype_template_parameter_auto >= 201606L) +# define _CCCL_NTTP_AUTO auto +#else // ^^^ C++17 ^^^ / vvv C++14 vvv +# define _CCCL_NO_NONTYPE_TEMPLATE_PARAMETER_AUTO +# define _CCCL_NTTP_AUTO unsigned long long int +#endif // _CCCL_STD_VER <= 2014 + // concepts are only available from C++20 onwards #if _CCCL_STD_VER <= 2017 || !defined(__cpp_concepts) || (__cpp_concepts < 201907L) # define _CCCL_NO_CONCEPTS diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index 00f69273673..1beb22b1807 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -557,6 +557,100 @@ using __type_front = __type_at_c<0, _List>; template using __type_back = __type_at_c<_List::__size - 1, _List>; +//! \brief A pair of types +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_pair +{ + using __first _CCCL_NODEBUG_ALIAS = _First; + using __second _CCCL_NODEBUG_ALIAS = _Second; +}; + +//! \brief Retrieve the first of a pair of types +//! \pre \c _Pair is a specialization of \c __type_pair +template +using __type_pair_first _CCCL_NODEBUG_ALIAS = typename _Pair::__first; + +//! \brief Retrieve the second of a pair of types +//! \pre \c _Pair is a specialization of \c __type_pair +template +using __type_pair_second _CCCL_NODEBUG_ALIAS = typename _Pair::__second; + +//! \see __type_switch +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_default +{ + template + using __rebind _CCCL_NODEBUG_ALIAS = __type_default; + + using type _CCCL_NODEBUG_ALIAS = _Value; +}; + +# if _CCCL_CUDACC_AT_LEAST(12, 0) || defined(_CCCL_DOXYGEN_INVOKED) + +//! 
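The doxygen example above shows the label/case/default shape; the atomics helper hunk earlier in this patch keys the switch on `sizeof`. A compressed sketch of that pattern follows (not part of the patch; it assumes C++17, the alias name and case payload types are made up for illustration, and including the detail header directly is an assumption):

  #include <cuda/std/__type_traits/type_list.h> // assumption: detail header defining __type_switch
  #include <cuda/std/type_traits>

  // Select a storage type by byte width, falling back to the type itself when no
  // case label matches.
  template <class _Type>
  using __storage_t = cuda::std::__type_switch<sizeof(_Type),
                                               cuda::std::__type_case<4, int>,
                                               cuda::std::__type_case<8, long long>,
                                               cuda::std::__type_default<_Type>>;

  static_assert(cuda::std::is_same<__storage_t<float>, int>::value, "");
  static_assert(cuda::std::is_same<__storage_t<double>, long long>::value, "");
  static_assert(cuda::std::is_same<__storage_t<char>, char>::value, ""); // default case
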
\see __type_switch +template <_CCCL_NTTP_AUTO _Label, class _Value> +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_case +{ + template + using __rebind _CCCL_NODEBUG_ALIAS = __type_case(_Label), _Value>; + + using type = _Value; +}; + +# else // ^^^ CUDACC >= 12.0 || DOXYGEN ^^^ / vvv CUDACC < 12.0 && !DOXYGEN vvv + +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_case_ +{ + template + using __rebind _CCCL_NODEBUG_ALIAS = __type_case_, _Value>; + + using type = _Value; +}; + +template <_CCCL_NTTP_AUTO _Label, class _Value> +using __type_case _CCCL_NODEBUG_ALIAS = __type_case_, _Value>; + +# endif // CUDACC < 12.0 && !DOXYGEN + +namespace __detail +{ +template <_CCCL_NTTP_AUTO _Label, class _Value> +_LIBCUDACXX_HIDE_FROM_ABI auto __type_switch_fn(__type_case<_Label, _Value>*, int) -> __type_case<_Label, _Value>; + +template <_CCCL_NTTP_AUTO _Label, class _Value> +_LIBCUDACXX_HIDE_FROM_ABI auto __type_switch_fn(__type_default<_Value>*, long) -> __type_default<_Value>; +} // namespace __detail + +//! \see __type_switch +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DECLSPEC_EMPTY_BASES __type_switch_fn + : _Cases::template __rebind<_Type>... +{ + template + using __call _CCCL_NODEBUG_ALIAS = + __type(static_cast<__type_switch_fn*>(nullptr), 0))>; +}; + +//! \brief Given an integral constant \c _Label and a pack of "cases" +//! consisting of one or more specializations of \c __type_case and zero or +//! one specializations of \c __type_default, `__type_switch<_Label, _Cases...>` +//! returns the value associated with the first case whose label matches the +//! given label. If no such case exists, the value associated with the default +//! case is returned. If no default case exists, the type is ill-formed. +//! +//! \p Example: +//! \code +//! using result = __type_switch<2, +//! __type_case<1, char>, +//! __type_case<2, double>, +//! __type_default>; +//! static_assert(is_same_v); +//! \endcode +template <_CCCL_NTTP_AUTO _Label, class... _Cases> +using __type_switch _CCCL_NODEBUG_ALIAS = + __type_call<__type_switch_fn, integral_constant>; + namespace __detail { # if _CCCL_COMPILER(MSVC, <, 19, 38) @@ -907,24 +1001,6 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_sizeof using __call _CCCL_NODEBUG_ALIAS = integral_constant; }; -//! \brief A pair of types -template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_pair -{ - using __first _CCCL_NODEBUG_ALIAS = _First; - using __second _CCCL_NODEBUG_ALIAS = _Second; -}; - -//! \brief Retreive the first of a pair of types -//! \pre \c _Pair is a specialization of \c __type_pair -template -using __type_pair_first = typename _Pair::__first; - -//! \brief Retreive the second of a pair of types -//! \pre \c _Pair is a specialization of \c __type_pair -template -using __type_pair_second = typename _Pair::__second; - //! \brief A list of compile-time values, and a meta-callable that accepts a //! meta-callable and evaluates it with the values, each value wrapped in an //! integral constant wrapper. 
diff --git a/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp index 2e477d3622a..9978d7e2c35 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp @@ -494,6 +494,26 @@ static_assert( ""); #endif +// __type_switch +static_assert(::cuda::std::is_same<::cuda::std::__type_switch<0, + ::cuda::std::__type_case<0, char>, + ::cuda::std::__type_case<1, double>, + ::cuda::std::__type_default>, + char>::value, + ""); +static_assert(::cuda::std::is_same<::cuda::std::__type_switch<1, + ::cuda::std::__type_case<0, char>, + ::cuda::std::__type_case<1, double>, + ::cuda::std::__type_default>, + double>::value, + ""); +static_assert(::cuda::std::is_same<::cuda::std::__type_switch<2, + ::cuda::std::__type_case<0, char>, + ::cuda::std::__type_case<1, double>, + ::cuda::std::__type_default>, + float>::value, + ""); + // __type_concat static_assert(::cuda::std::is_same<::cuda::std::__type_concat<>, ::cuda::std::__type_list<>>::value, ""); From 0f8687420e80bdd7449e0b173babf7692fcb9a18 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 27 Nov 2024 14:22:52 -0800 Subject: [PATCH 37/45] replace use of old `_CONCEPT_FRAGMENT` macro in cudax (#2973) * replace use of old `_CONCEPT_FRAGMENT` macro in cudax * fix docs build --- .../cuda/experimental/__stream/get_stream.cuh | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cudax/include/cuda/experimental/__stream/get_stream.cuh b/cudax/include/cuda/experimental/__stream/get_stream.cuh index 9edf1d251df..6703cf67ead 100644 --- a/cudax/include/cuda/experimental/__stream/get_stream.cuh +++ b/cudax/include/cuda/experimental/__stream/get_stream.cuh @@ -33,24 +33,21 @@ namespace cuda::experimental { - -template -_CCCL_CONCEPT __convertible_to_stream_ref = _CUDA_VSTD::convertible_to<_Tp, ::cuda::stream_ref>; - -template -_CCCL_CONCEPT_FRAGMENT( - __has_member_get_stream_, - requires(const _Tp& __t)(requires(!__convertible_to_stream_ref<_Tp>), - requires(_CUDA_VSTD::same_as))); - +// clang-format off template -_CCCL_CONCEPT __has_member_get_stream = _CCCL_FRAGMENT(__has_member_get_stream_, _Tp); +_CCCL_CONCEPT __has_member_get_stream = + _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __t) + ( + requires(!_CUDA_VSTD::convertible_to<_Tp, ::cuda::stream_ref>), + _Same_as(::cuda::stream_ref) __t.get_stream() + ); +// clang-format on //! 
@brief `get_stream` is a customization point object that queries a type `T` for an associated stream struct get_stream_t { _CCCL_TEMPLATE(class _Tp) - _CCCL_REQUIRES(__convertible_to_stream_ref<_Tp>) + _CCCL_REQUIRES((_CUDA_VSTD::convertible_to<_Tp, ::cuda::stream_ref>) ) _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr ::cuda::stream_ref operator()(const _Tp& __t) const noexcept(noexcept(static_cast<::cuda::stream_ref>(__t))) { From a5d33e070708f65234b353baed0df380bad4b367 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 27 Nov 2024 22:46:05 -0800 Subject: [PATCH 38/45] remove vestigal uses of the old `DOXYGEN_SHOULD_SKIP_THIS` macro (#2978) --- .../__memory_resource/managed_memory_resource.cuh | 4 ++-- .../experimental/__memory_resource/pinned_memory_resource.cuh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh index f240155339c..57394558757 100644 --- a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh @@ -159,7 +159,7 @@ public: } #endif // _CCCL_STD_VER <= 2017 -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # if _CCCL_STD_VER >= 2020 //! @brief Equality comparison between a \c managed_memory_resource and another resource //! @param __rhs The resource to compare to @@ -237,7 +237,7 @@ public: friend constexpr void get_property(managed_memory_resource const&, mr::device_accessible) noexcept {} //! @brief Enables the \c host_accessible property friend constexpr void get_property(managed_memory_resource const&, mr::host_accessible) noexcept {} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept diff --git a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh index 60ec7c9b49e..7b36888b0ef 100644 --- a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh @@ -160,7 +160,7 @@ public: } #endif // _CCCL_STD_VER <= 2017 -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # if _CCCL_STD_VER >= 2020 //! @brief Equality comparison between a \c pinned_memory_resource and another resource //! @param __rhs The resource to compare to @@ -239,7 +239,7 @@ public: friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} //! @brief Enables the \c host_accessible property friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! 
@brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept From d68714d45c608d35f7d4a36f2b404a58780fbc82 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 28 Nov 2024 08:24:09 +0100 Subject: [PATCH 39/45] Fix proclaim_copyable_arguments for lambdas (#2833) Co-authored-by: Michael Schellenberger Costa --- cudax/cmake/cudaxBuildCompilerTargets.cmake | 1 + .../cuda/__functional/address_stability.h | 11 +++++- thrust/testing/address_stability.cmake | 12 ++++++ thrust/testing/address_stability.cu | 37 +++++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 thrust/testing/address_stability.cmake diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index f19ced87e49..84fec426823 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -47,6 +47,7 @@ function(cudax_build_compiler_targets) if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") # stf heavily uses host device lambdas which break on clang due to a warning about the implicitly # deleted copy constructor + # TODO(bgruber): remove this when NVBug 4980157 is resolved append_option_if_available("-Wno-deprecated-copy" cxx_compile_options) endif() diff --git a/libcudacxx/include/cuda/__functional/address_stability.h b/libcudacxx/include/cuda/__functional/address_stability.h index f745b963b42..f2ef9f6d331 100644 --- a/libcudacxx/include/cuda/__functional/address_stability.h +++ b/libcudacxx/include/cuda/__functional/address_stability.h @@ -48,6 +48,13 @@ _CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copya template struct __callable_permitting_copied_arguments : F { +#if _CCCL_STD_VER <= 2014 + template + _LIBCUDACXX_HIDE_FROM_ABI constexpr __callable_permitting_copied_arguments(G&& g) + : F(::cuda::std::forward(g)) + {} +#endif // _CCCL_STD_VER <= 2014 + using F::operator(); }; @@ -61,9 +68,9 @@ struct proclaims_copyable_arguments<__callable_permitting_copied_arguments> : //! @see proclaims_copyable_arguments template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto -proclaim_copyable_arguments(F f) -> __callable_permitting_copied_arguments +proclaim_copyable_arguments(F&& f) -> __callable_permitting_copied_arguments<::cuda::std::decay_t> { - return __callable_permitting_copied_arguments{_CUDA_VSTD::move(f)}; + return {::cuda::std::forward(f)}; } // Specializations for libcu++ function objects are provided here to not pull this include into `` headers diff --git a/thrust/testing/address_stability.cmake b/thrust/testing/address_stability.cmake new file mode 100644 index 00000000000..e02e34f5870 --- /dev/null +++ b/thrust/testing/address_stability.cmake @@ -0,0 +1,12 @@ +target_compile_options(${test_target} PRIVATE $<$: --extended-lambda>) + +# this check is actually not correct, because we must check the host compiler, not the CXX compiler. 
+# We rely on that those are usually the same ;) +if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) + # When clang >= 13 is used as host compiler, we get the following warning: + # nvcc_internal_extended_lambda_implementation:312:22: error: definition of implicit copy constructor for '__nv_hdl_wrapper_t, int (const int &)>' is deprecated because it has a user-declared copy assignment operator [-Werror,-Wdeprecated-copy] + # 312 | __nv_hdl_wrapper_t & operator=(const __nv_hdl_wrapper_t &in) = delete; + # | ^ + # Let's suppress it until NVBug 4980157 is resolved. + target_compile_options(${test_target} PRIVATE $<$: -Wno-deprecated-copy>) +endif () diff --git a/thrust/testing/address_stability.cu b/thrust/testing/address_stability.cu index 987fc938058..1fed9100097 100644 --- a/thrust/testing/address_stability.cu +++ b/thrust/testing/address_stability.cu @@ -83,3 +83,40 @@ void TestAddressStabilityUserDefinedFunctionObject() static_assert(proclaims_copyable_arguments{}))>::value, ""); } DECLARE_UNITTEST(TestAddressStabilityUserDefinedFunctionObject); + +void TestAddressStabilityLambda() +{ + using ::cuda::proclaim_copyable_arguments; + using ::cuda::proclaims_copyable_arguments; + + { + auto l = [](const int& i) { + return i + 2; + }; + static_assert(!proclaims_copyable_arguments::value, ""); + auto pr_l = proclaim_copyable_arguments(l); + ASSERT_EQUAL(pr_l(3), 5); + static_assert(proclaims_copyable_arguments::value, ""); + } + + { + auto l = [] _CCCL_DEVICE(const int& i) { + return i + 2; + }; + static_assert(!proclaims_copyable_arguments::value, ""); + auto pr_device_l = proclaim_copyable_arguments(l); + (void) &pr_device_l; + static_assert(proclaims_copyable_arguments::value, ""); + } + + { + auto l = [] _CCCL_HOST_DEVICE(const int& i) { + return i + 2; + }; + static_assert(!proclaims_copyable_arguments::value, ""); + auto pr_l = proclaim_copyable_arguments(l); + ASSERT_EQUAL(pr_l(3), 5); + static_assert(proclaims_copyable_arguments::value, ""); + } +} +DECLARE_UNITTEST(TestAddressStabilityLambda); From af0a8bb6239503709d74d7e1120c06870ff9687a Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 28 Nov 2024 11:53:37 +0100 Subject: [PATCH 40/45] Forward declare half types in cuda::ptx (#2981) --- .../__ptx/instructions/cp_reduce_async_bulk.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index f1487301ada..ce7af1ecc20 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -28,16 +28,12 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#if defined(_LIBCUDACXX_HAS_NVFP16) -# include -#endif // _LIBCUDACXX_HAS_NVFP16 - -#if defined(_LIBCUDACXX_HAS_NVBF16) -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") -# include -_CCCL_DIAG_POP -#endif // _LIBCUDACXX_HAS_NVBF16 +// Forward-declare __half and __nv_bfloat16. The cuda_fp16.h and cuda_bf16.h are +// expensive to include. The APIs use only pointers, so we do not have to define +// the types. If the user wants to use these types, it is their responsibility +// to include the headers. 
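Illustrative aside, not part of the patch: the forward declarations that follow are sufficient because, as the comment above notes, these APIs traffic only in pointers, and pointers to an incomplete type are fully usable in declarations. A minimal standalone C++ sketch of the same idea, with a hypothetical function name (reduce_bulk is not a real CCCL API):

struct __half;                        // incomplete type, exactly as the patch declares it
void reduce_bulk(const __half* src);  // hypothetical pointer-only API; no definition of __half needed

// Only a caller that actually creates or reads __half values must include <cuda_fp16.h> itself:
//   #include <cuda_fp16.h>
//   __half h = __float2half(1.0f);
//   reduce_bulk(&h);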
+struct __half; +struct __nv_bfloat16; _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX From 9beeb267e3012fe4fd9c0378e7f6c11c2573c2a3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 28 Nov 2024 18:35:20 +0100 Subject: [PATCH 41/45] Fix tuning benchmark for `cub::DeviceTransform` (#2970) * Replace CUB_DETAIL_COUNT by _CCCL_PP_COUNT. It was removed at some point, but not replaced everywhere. * Add missing pragma once to header * Fix use of _CUB_HAS_TRANSFORM_UBLKCP before it is defined --- cub/benchmarks/bench/transform/babelstream1.cu | 17 ----------------- cub/benchmarks/bench/transform/babelstream2.cu | 17 ----------------- cub/benchmarks/bench/transform/babelstream3.cu | 17 ----------------- cub/benchmarks/bench/transform/common.h | 15 +++++++++++++++ cub/benchmarks/bench/transform/complex_cmp.cu | 17 ----------------- cub/benchmarks/bench/transform/fib.cu | 17 ----------------- cub/benchmarks/bench/transform/heavy.cu | 17 ----------------- 7 files changed, 15 insertions(+), 102 deletions(-) diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu index c3b9306398d..ba796f4982e 100644 --- a/cub/benchmarks/bench/transform/babelstream1.cu +++ b/cub/benchmarks/bench/transform/babelstream1.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - template static void mul(nvbench::state& state, nvbench::type_list) { diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu index 61d4e905d92..33ffd6ee173 100644 --- a/cub/benchmarks/bench/transform/babelstream2.cu +++ b/cub/benchmarks/bench/transform/babelstream2.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - template static void add(nvbench::state& state, nvbench::type_list) { diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu index a5c969764ae..90ce2e74ac4 100644 --- a/cub/benchmarks/bench/transform/babelstream3.cu +++ b/cub/benchmarks/bench/transform/babelstream3.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 
(ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - template static void nstream(nvbench::state& state, nvbench::type_list) { diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index 68a158c92bb..d8339645429 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -1,7 +1,22 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause +#pragma once + +// keep checks at the top so compilation of discarded variants fails really fast #include +#if !TUNE_BASE && TUNE_ALGORITHM == 1 +# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "When tuning, this benchmark does not support being compiled for multiple architectures" +# endif +# if (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif +# ifndef _CUB_HAS_TRANSFORM_UBLKCP +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + #include #include diff --git a/cub/benchmarks/bench/transform/complex_cmp.cu b/cub/benchmarks/bench/transform/complex_cmp.cu index ac9eb4b0f8b..6849820ee5b 100644 --- a/cub/benchmarks/bench/transform/complex_cmp.cu +++ b/cub/benchmarks/bench/transform/complex_cmp.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - // This benchmark tests overlapping memory regions for reading and is compute intensive template diff --git a/cub/benchmarks/bench/transform/fib.cu b/cub/benchmarks/bench/transform/fib.cu index 8a6c4c3dfa8..b7e16031907 100644 --- a/cub/benchmarks/bench/transform/fib.cu +++ b/cub/benchmarks/bench/transform/fib.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - // This benchmark is compute intensive with diverging threads template diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu index 7c35b069e24..be17a04fd8c 100644 --- a/cub/benchmarks/bench/transform/heavy.cu 
+++ b/cub/benchmarks/bench/transform/heavy.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - // This benchmark uses a LOT of registers and is compute intensive. template From d9a94936d88670dfff12516cb5c2b1c400c6e3b2 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 29 Nov 2024 09:48:44 +0100 Subject: [PATCH 42/45] fix old gcc version check (#2989) --- libcudacxx/include/cuda/std/__cccl/builtin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index 4e0bfae8a9e..2097bad2d74 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -242,7 +242,7 @@ # undef _CCCL_BUILTIN_ISNAN #endif // _CCCL_CUDACC_BELOW(11, 7) -#if (_CCCL_CHECK_BUILTIN(builtin_launder) || (_CCCL_COMPILER(GCC) && _CCCL_GCC_VERSION >= 70000)) +#if _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) # define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_launder) && gcc >= 7 From 5bb947109065c8189a96deacbd853e11ff253a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hao=20Zhang=28=E5=BC=A0=E6=B5=A9=29?= Date: Sat, 30 Nov 2024 23:17:16 +0800 Subject: [PATCH 43/45] Fix a typo in thrust/binary_search.h (#2980) (#2992) --- thrust/thrust/binary_search.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/thrust/thrust/binary_search.h b/thrust/thrust/binary_search.h index 20e96722ea3..d370fe37f95 100644 --- a/thrust/thrust/binary_search.h +++ b/thrust/thrust/binary_search.h @@ -756,8 +756,8 @@ bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, * thrust::equal_range(thrust::device, input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + * 1) thrust::equal_range(thrust::device, input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + * 2) thrust::equal_range(thrust::device, input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + - * 2) thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end) + * 2) thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end()) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end()) * \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range @@ -821,8 +821,8 @@ _CCCL_HOST_DEVICE thrust::pair equal_range( * thrust::equal_range(input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1) * thrust::equal_range(input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2) * 
thrust::equal_range(input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2) - * thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) - * thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end) + * thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end()) + * thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end()) * \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range @@ -893,8 +893,8 @@ equal_range(ForwardIterator first, ForwardIterator last, const LessThanComparabl * thrust::less()); // returns [input.begin() + 1, input.begin() + 2) thrust::equal_range(thrust::device, * input.begin(), input.end(), 3, thrust::less()); // returns [input.begin() + 2, input.begin() + 2) * thrust::equal_range(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns [input.begin() + - * 4, input.end) thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns - * [input.end(), input.end) \endcode + * 4, input.end()) thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns + * [input.end(), input.end()) \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range * \see \p lower_bound @@ -962,8 +962,8 @@ _CCCL_HOST_DEVICE thrust::pair equal_range( * input.begin() + 1) thrust::equal_range(input.begin(), input.end(), 2, thrust::less()); // returns [input.begin() * + 1, input.begin() + 2) thrust::equal_range(input.begin(), input.end(), 3, thrust::less()); // returns * [input.begin() + 2, input.begin() + 2) thrust::equal_range(input.begin(), input.end(), 8, thrust::less()); // - * returns [input.begin() + 4, input.end) thrust::equal_range(input.begin(), input.end(), 9, thrust::less()); // - * returns [input.end(), input.end) \endcode + * returns [input.begin() + 4, input.end()) thrust::equal_range(input.begin(), input.end(), 9, thrust::less()); // + * returns [input.end(), input.end()) \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range * \see \p lower_bound From 2a1273921aca66a80850f850143092458b31efb7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 30 Nov 2024 16:23:09 +0100 Subject: [PATCH 44/45] Enable assertions for CCCL users in CMake Debug builds (#2986) Fixes: #2975 --- lib/cmake/libcudacxx/libcudacxx-config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/cmake/libcudacxx/libcudacxx-config.cmake b/lib/cmake/libcudacxx/libcudacxx-config.cmake index 824a4976b19..3945f726af0 100644 --- a/lib/cmake/libcudacxx/libcudacxx-config.cmake +++ b/lib/cmake/libcudacxx/libcudacxx-config.cmake @@ -39,6 +39,7 @@ set(_libcudacxx_INCLUDE_DIR "${_libcudacxx_VERSION_INCLUDE_DIR}" ) unset(_libcudacxx_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache target_include_directories(_libcudacxx_libcudacxx INTERFACE "${_libcudacxx_INCLUDE_DIR}") +target_compile_definitions(_libcudacxx_libcudacxx INTERFACE $<$:CCCL_ENABLE_ASSERTIONS>) # # Standardize version info From cb5921b33dc8ae1c4038a6386d79c1a973422c45 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 30 Nov 2024 16:31:02 +0100 Subject: [PATCH 45/45] Fix CMake warning for FindPythonInterp (#2982) CMake Warning (dev) at libcudacxx/CMakeLists.txt:43 (include): Policy CMP0148 is not set: The FindPythonInterp and FindPythonLibs modules are removed. Run "cmake --help-policy CMP0148" for policy details. 
Use the cmake_policy command to set the policy and suppress this warning. --- libcudacxx/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/CMakeLists.txt b/libcudacxx/CMakeLists.txt index 1ccfb6a92ff..39f86b6bdb2 100644 --- a/libcudacxx/CMakeLists.txt +++ b/libcudacxx/CMakeLists.txt @@ -40,8 +40,8 @@ option(LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS "Enable libcu++ tests." ON) if (LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS) enable_testing() - include(FindPythonInterp) - if (NOT PYTHONINTERP_FOUND) + find_package (Python COMPONENTS Interpreter) + if (NOT Python_Interpreter_FOUND) message(FATAL_ERROR "Failed to find python interpreter, which is required for running tests and " "building a libcu++ static library.")
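Illustrative aside, not part of the patch: a minimal, self-contained CMake sketch of the modernized Python detection this last patch switches to. The project name and standalone layout are hypothetical; the real libcudacxx/CMakeLists.txt does more with the interpreter than shown here.

cmake_minimum_required(VERSION 3.15)
project(python_detection_demo LANGUAGES NONE)

# FindPythonInterp/FindPythonLibs are deprecated (policy CMP0148); FindPython replaces them.
find_package(Python COMPONENTS Interpreter)
if (NOT Python_Interpreter_FOUND)
  message(FATAL_ERROR "Failed to find a Python interpreter")
endif ()
message(STATUS "Using Python interpreter: ${Python_EXECUTABLE}")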