From 1b8151c081149268ab2fec3961570bb538f1194b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 21 Nov 2024 16:05:35 +0100 Subject: [PATCH 01/45] Try to work around issue with NVHPC in conjunction of older CTK versions (#2889) NVHPC can consume older CTK headers for stdpar, so we need to try and avoid using those --- cub/cub/thread/thread_operators.cuh | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index cfc47edcfe7..2de65083843 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -440,10 +440,15 @@ struct SimdMin<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x)), + ::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmin2(a, b);), (return __halves2half2(__float2half(::cuda::minimum<>{}(__half2float(a.x), __half2float(b.x))), __float2half(::cuda::minimum<>{}(__half2float(a.y), __half2float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -470,11 +475,16 @@ struct SimdMin<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)), + ::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmin2(a, b);), (return cub::internal::halves2bfloat162( __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))), __float2bfloat16(::cuda::minimum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -521,10 +531,15 @@ struct SimdMax<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(::cuda::maximum<>{}(__half2float(a.x), __half2float(b.x)), + ::cuda::maximum<>{}(__half2float(a.y), __half2float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmax2(a, b);), (return __halves2half2(__float2half(::cuda::maximum<>{}(__half2float(a.x), __half2float(b.x))), __float2half(::cuda::maximum<>{}(__half2float(a.y), __half2float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -539,11 +554,16 @@ struct SimdMax<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn(::cuda::maximum<>{}(__bfloat162float(a.x), __bfloat162float(b.x)), + ::cuda::maximum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return 
__hmax2(a, b);), (return cub::internal::halves2bfloat162( __float2bfloat16(::cuda::maximum<>{}(__bfloat162float(a.x), __bfloat162float(b.x))), __float2bfloat16(::cuda::maximum<>{}(__bfloat162float(a.y), __bfloat162float(b.y))));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -566,10 +586,14 @@ struct SimdSum<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(__half2float(a.x) + __half2float(b.x), __half2float(a.y) + __half2float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hadd2(a, b);), (return __halves2half2(__float2half(__half2float(a.x) + __half2float(b.x)), __float2half(__half2float(a.y) + __half2float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -584,11 +608,16 @@ struct SimdSum<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn( + __bfloat162float(a.x) + __bfloat162float(b.x), __bfloat162float(a.y) + __bfloat162float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET( NV_PROVIDES_SM_80, (return __hadd2(a, b);), (return cub::internal::halves2bfloat162(__float2bfloat16(__bfloat162float(a.x) + __bfloat162float(b.x)), __float2bfloat16(__bfloat162float(a.y) + __bfloat162float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -611,10 +640,14 @@ struct SimdMul<__half> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __half2 operator()(__half2 a, __half2 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2half2_rn(__half2float(a.x) * __half2float(b.x), __half2float(a.y) * __half2float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hmul2(a, b);), (return __halves2half2(__float2half(__half2float(a.x) * __half2float(b.x)), __float2half(__half2float(a.y) * __half2float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; @@ -629,10 +662,15 @@ struct SimdMul<__nv_bfloat16> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE __nv_bfloat162 operator()(__nv_bfloat162 a, __nv_bfloat162 b) const { +# if _CCCL_CUDACC_BELOW(12, 0) && defined(_CCCL_CUDA_COMPILER_NVHPC) + return __floats2bfloat162_rn( + __bfloat162float(a.x) * __bfloat162float(b.x), __bfloat162float(a.y) * __bfloat162float(b.y)); +# else // ^^^ _CCCL_CUDACC_BELOW(12, 0) && _CCCL_CUDA_COMPILER_NVHPC ^^^ / vvv otherwise vvv NV_IF_TARGET(NV_PROVIDES_SM_80, (return __hmul2(a, b);), (return halves2bfloat162(__float2bfloat16(__bfloat162float(a.x) * __bfloat162float(b.x)), __float2bfloat16(__bfloat162float(a.y) * __bfloat162float(b.y)));)); +# endif // !_CCCL_CUDACC_BELOW(12, 0) || !_CCCL_CUDA_COMPILER_NVHPC } }; From 9af2a13df00318cefb4902500bef74074ec50e8e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 21 Nov 2024 16:14:53 +0100 Subject: [PATCH 02/45] Refactoring (#2905) --- .../nvbench_helper/nvbench_helper.cuh | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh 
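[Editor's note on the first patch above] The NVHPC/old-CTK branch avoids the half2/bfloat162 intrinsics entirely and goes through float instead. A reduced, stand-alone sketch of that fallback pattern, assuming only the basic cuda_fp16.h conversion helpers are available; half2_min_via_float is a made-up name for illustration, not part of the patch:

  #include <cuda_fp16.h>

  // Per-lane minimum of two __half2 values without __hmin2: convert each lane
  // to float, compare, and repack with round-to-nearest.
  __device__ __half2 half2_min_via_float(__half2 a, __half2 b)
  {
    const float ax = __half2float(a.x);
    const float ay = __half2float(a.y);
    const float bx = __half2float(b.x);
    const float by = __half2float(b.y);
    return __floats2half2_rn(ax < bx ? ax : bx, ay < by ? ay : by);
  }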
b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index e8dacb4a1ff..88b189cf964 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -418,52 +418,51 @@ struct less_t { return lhs < rhs; } -}; - -template <> -__host__ __device__ inline bool less_t::operator()(const complex& lhs, const complex& rhs) const -{ - double magnitude_0 = cuda::std::abs(lhs); - double magnitude_1 = cuda::std::abs(rhs); - if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1)) - { - // NaN's are always equal. - return false; - } - else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1)) + __host__ __device__ inline bool operator()(const complex& lhs, const complex& rhs) const { - // If the real or imaginary part of the complex number has a very large value - // (close to the maximum representable value for a double), it is possible that - // the magnitude computation can result in positive infinity: - // ```cpp - // const double large_number = std::numeric_limits::max() / 2; - // std::complex z(large_number, large_number); - // std::abs(z) == inf; - // ``` - // Dividing both components by a constant before computing the magnitude prevents overflow. - const complex::value_type scaler = 0.5; - - magnitude_0 = cuda::std::abs(lhs * scaler); - magnitude_1 = cuda::std::abs(rhs * scaler); - } + double magnitude_0 = cuda::std::abs(lhs); + double magnitude_1 = cuda::std::abs(rhs); + + if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1)) + { + // NaN's are always equal. + return false; + } + else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1)) + { + // If the real or imaginary part of the complex number has a very large value + // (close to the maximum representable value for a double), it is possible that + // the magnitude computation can result in positive infinity: + // ```cpp + // const double large_number = std::numeric_limits::max() / 2; + // std::complex z(large_number, large_number); + // std::abs(z) == inf; + // ``` + // Dividing both components by a constant before computing the magnitude prevents overflow. + const complex::value_type scaler = 0.5; + + magnitude_0 = cuda::std::abs(lhs * scaler); + magnitude_1 = cuda::std::abs(rhs * scaler); + } - const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1); - const complex::value_type threshold = cuda::std::numeric_limits::epsilon() * 2; + const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1); + const complex::value_type threshold = cuda::std::numeric_limits::epsilon() * 2; - if (difference < threshold) - { - // Triangles with the same magnitude are sorted by their phase angle. - const complex::value_type phase_angle_0 = cuda::std::arg(lhs); - const complex::value_type phase_angle_1 = cuda::std::arg(rhs); + if (difference < threshold) + { + // Triangles with the same magnitude are sorted by their phase angle. 
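[Editor's note] The comparator being folded into less_t here orders complex values by magnitude, rescaling both operands first so that cuda::std::abs cannot overflow to infinity when both components are near the maximum representable value. A reduced sketch of just that scaling step, shown with cuda::std::complex<double> for simplicity; the magnitude_less helper is illustrative, not part of the benchmark helper:

  #include <cuda/std/complex>

  // For z with both components near DBL_MAX, cuda::std::abs(z) overflows to +inf.
  // Halving both operands keeps the intermediate finite and, because both sides
  // are scaled by the same positive factor, preserves the ordering.
  __host__ __device__ inline bool magnitude_less(cuda::std::complex<double> lhs,
                                                 cuda::std::complex<double> rhs)
  {
    const double scaler = 0.5;
    return cuda::std::abs(lhs * scaler) < cuda::std::abs(rhs * scaler);
  }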
+ const complex::value_type phase_angle_0 = cuda::std::arg(lhs); + const complex::value_type phase_angle_1 = cuda::std::arg(rhs); - return phase_angle_0 < phase_angle_1; - } - else - { - return magnitude_0 < magnitude_1; + return phase_angle_0 < phase_angle_1; + } + else + { + return magnitude_0 < magnitude_1; + } } -} +}; struct max_t { From 801b794cc8f46837cd66595eb9f0bc6824907630 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 21 Nov 2024 11:02:57 -0800 Subject: [PATCH 03/45] add "`interface`" to `_CCCL_PUSH_MACROS` (#2919) --- libcudacxx/include/cuda/std/__cccl/diagnostic.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/diagnostic.h b/libcudacxx/include/cuda/std/__cccl/diagnostic.h index 4183fd96bf7..fdedae215f3 100644 --- a/libcudacxx/include/cuda/std/__cccl/diagnostic.h +++ b/libcudacxx/include/cuda/std/__cccl/diagnostic.h @@ -175,9 +175,12 @@ # define _CCCL_PUSH_MACROS _CCCL_MSVC_WARNINGS_PUSH # define _CCCL_POP_MACROS _CCCL_MSVC_WARNINGS_POP #else // ^^^ _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO ^^^ / vvv !_CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO vvv -# define _CCCL_PUSH_MACROS _CCCL_PRAGMA(push_macro("min")) _CCCL_PRAGMA(push_macro("max")) _CCCL_MSVC_WARNINGS_PUSH -# define _CCCL_POP_MACROS _CCCL_PRAGMA(pop_macro("min")) _CCCL_PRAGMA(pop_macro("max")) _CCCL_MSVC_WARNINGS_POP - +# define _CCCL_PUSH_MACROS \ + _CCCL_PRAGMA(push_macro("min")) \ + _CCCL_PRAGMA(push_macro("max")) _CCCL_PRAGMA(push_macro("interface")) _CCCL_MSVC_WARNINGS_PUSH +# define _CCCL_POP_MACROS \ + _CCCL_PRAGMA(pop_macro("min")) \ + _CCCL_PRAGMA(pop_macro("max")) _CCCL_PRAGMA(pop_macro("interface")) _CCCL_MSVC_WARNINGS_POP #endif // !_CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO #endif // __CCCL_DIAGNOSTIC_H From 0722044948f46e61e704828be78843ff256c7eb6 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Thu, 21 Nov 2024 12:34:03 -0800 Subject: [PATCH 04/45] Replace inconsistent Doxygen macros with `_CCCL_DOXYGEN_INVOKED` (#2921) fixes #2362 --- cub/cub/block/block_discontinuity.cuh | 4 +- cub/cub/block/block_exchange.cuh | 4 +- cub/cub/block/block_load.cuh | 4 +- cub/cub/block/block_merge_sort.cuh | 4 +- cub/cub/block/block_radix_rank.cuh | 16 ++--- cub/cub/block/block_radix_sort.cuh | 8 +-- cub/cub/block/block_run_length_decode.cuh | 4 +- cub/cub/block/block_scan.cuh | 4 +- cub/cub/block/block_store.cuh | 4 +- cub/cub/block/radix_rank_sort_operations.cuh | 4 +- cub/cub/detail/array_utils.cuh | 4 +- cub/cub/detail/detect_cuda_runtime.cuh | 2 +- cub/cub/detail/nvtx.cuh | 4 +- cub/cub/detail/strong_load.cuh | 4 +- cub/cub/detail/strong_store.cuh | 4 +- cub/cub/device/device_adjacent_difference.cuh | 16 ++--- cub/cub/device/device_histogram.cuh | 32 +++++----- cub/cub/device/device_merge_sort.cuh | 24 +++---- cub/cub/device/device_partition.cuh | 12 ++-- cub/cub/device/device_radix_sort.cuh | 16 ++--- cub/cub/device/device_reduce.cuh | 28 ++++---- cub/cub/device/device_run_length_encode.cuh | 8 +-- cub/cub/device/device_scan.cuh | 56 ++++++++-------- .../device/device_segmented_radix_sort.cuh | 32 +++++----- cub/cub/device/device_segmented_reduce.cuh | 24 +++---- cub/cub/device/device_segmented_sort.cuh | 64 +++++++++---------- cub/cub/device/device_select.cuh | 24 +++---- cub/cub/device/device_spmv.cuh | 4 +- cub/cub/device/device_transform.cuh | 16 ++--- .../dispatch/dispatch_adjacent_difference.cuh | 8 +-- .../device/dispatch/dispatch_histogram.cuh | 16 ++--- cub/cub/device/dispatch/dispatch_reduce.cuh | 16 ++--- .../dispatch/dispatch_reduce_by_key.cuh | 4 +- 
cub/cub/device/dispatch/dispatch_rle.cuh | 4 +- cub/cub/device/dispatch/dispatch_scan.cuh | 8 +-- .../device/dispatch/dispatch_scan_by_key.cuh | 8 +-- .../dispatch/dispatch_segmented_sort.cuh | 8 +-- .../device/dispatch/dispatch_select_if.cuh | 4 +- .../device/dispatch/dispatch_spmv_orig.cuh | 8 +-- .../dispatch/dispatch_three_way_partition.cuh | 4 +- .../dispatch/dispatch_unique_by_key.cuh | 8 +-- cub/cub/grid/grid_queue.cuh | 4 +- cub/cub/thread/thread_load.cuh | 4 +- cub/cub/thread/thread_operators.cuh | 4 +- cub/cub/thread/thread_reduce.cuh | 8 +-- cub/cub/thread/thread_store.cuh | 4 +- cub/cub/util_allocator.cuh | 4 +- cub/cub/util_arch.cuh | 2 +- cub/cub/util_cpp_dialect.cuh | 4 +- cub/cub/util_debug.cuh | 4 +- cub/cub/util_device.cuh | 12 ++-- cub/cub/util_macro.cuh | 2 +- cub/cub/util_ptx.cuh | 12 ++-- cub/cub/util_temporary_storage.cuh | 4 +- cub/cub/util_type.cuh | 12 ++-- cub/cub/util_vsmem.cuh | 4 +- cub/cub/warp/warp_reduce.cuh | 8 +-- .../uninitialized_async_buffer.cuh | 4 +- .../__container/uninitialized_buffer.cuh | 4 +- .../cuda/experimental/__device/device.cuh | 2 +- .../device_memory_resource.cuh | 4 +- .../__stf/internal/data_interface.cuh | 4 +- .../__stf/internal/execution_policy.cuh | 6 +- .../__stf/internal/reduction_base.cuh | 4 +- .../places/exec/host/callback_queues.cuh | 4 +- .../experimental/__stf/places/inner_shape.cuh | 4 +- .../experimental/__stf/stream/reduction.cuh | 4 +- .../experimental/__stf/stream/stream_task.cuh | 4 +- .../cuda/experimental/__stf/utility/core.cuh | 8 +-- .../experimental/__stf/utility/unittest.cuh | 6 +- .../__utility/ensure_current_device.cuh | 4 +- docs/repo.toml | 9 +-- .../cuda/std/__type_traits/type_list.h | 4 +- thrust/thrust/detail/type_deduction.h | 4 +- thrust/thrust/device_malloc_allocator.h | 4 +- thrust/thrust/device_ptr.h | 4 +- thrust/thrust/device_reference.h | 2 +- thrust/thrust/memory.h | 4 +- thrust/thrust/optional.h | 4 +- thrust/thrust/pair.h | 18 +++--- .../random/linear_congruential_engine.h | 4 +- thrust/thrust/tuple.h | 18 +++--- 82 files changed, 376 insertions(+), 379 deletions(-) diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index 2fb15e9059b..fb88dfac07f 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -270,7 +270,7 @@ public: //! @name Head flag operations //! @{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * @param[out] head_flags @@ -349,7 +349,7 @@ public: Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sets head flags indicating discontinuities between items partitioned across the thread diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index a781d68e68b..bdc2a3dc932 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1217,7 +1217,7 @@ public: //! @} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// @param[in-out] items /// Items to exchange, converting between **striped** and **blocked** arrangements. 
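[Editor's note on the _CCCL_PUSH_MACROS change in patch 03 above] "interface" is one of the identifiers Windows headers can define as an object-like macro, so the guard now pushes and pops it alongside "min" and "max". A self-contained sketch of the underlying pragma pattern; the initial #define only simulates the definition picked up from Windows headers:

  #define interface struct // stand-in for the Windows-provided macro

  #pragma push_macro("interface")
  #undef interface
  // Inside the push/pop region the identifier is usable again.
  struct interface
  {
    virtual ~interface() = default;
  };
  #pragma pop_macro("interface")
  // Here "interface" expands to "struct" once more, as surrounding code may expect.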
@@ -1292,7 +1292,7 @@ public: ScatterToStriped(items, items, ranks, is_valid); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 641ff6d5d09..c1e9b95ac56 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -179,7 +179,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document //! @brief Internal implementation for load vectorization //! @@ -225,7 +225,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (& } } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Load a linear segment of items into a blocked arrangement across the thread block. diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index 29510db5e97..b6d0c8a33b1 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -175,14 +175,14 @@ private: // Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Shared memory type required by this thread block union _TempStorage { KeyT keys_shared[ITEMS_PER_TILE + 1]; ValueT items_shared[ITEMS_PER_TILE + 1]; }; // union TempStorage -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// Shared storage reference _TempStorage& temp_storage; diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 73228368fc5..5426e967712 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -93,7 +93,7 @@ struct BlockRadixRankEmptyCallback _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(int (&bins)[BINS_PER_THREAD]) {} }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -121,7 +121,7 @@ struct warp_in_block_matcher_t }; } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. 
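[Editor's note] The rename that runs through the rest of this patch swaps the ad-hoc DOXYGEN_SHOULD_SKIP_THIS guard for _CCCL_DOXYGEN_INVOKED, a macro the documentation build predefines so that implementation details are skipped while normal compilation is unaffected. A sketch of the pattern it protects; detail::helper_t and public_api are placeholders, not CUB names:

  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
  // Implementation detail: compiled normally, but skipped when the
  // documentation build predefines _CCCL_DOXYGEN_INVOKED.
  namespace detail
  {
  struct helper_t
  {
    int value;
  };
  } // namespace detail
  #endif // _CCCL_DOXYGEN_INVOKED

  //! Public, documented entry point (this part always reaches Doxygen).
  inline int public_api(int x)
  {
    return x + 1;
  }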
@@ -263,7 +263,7 @@ private: /// BlockScan type using BlockScan = BlockScan; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document struct __align__(16) _TempStorage { union Aliasable @@ -276,7 +276,7 @@ private: // Storage for scanning local ranks typename BlockScan::TempStorage block_scan; }; -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED /// Shared storage reference _TempStorage& temp_storage; @@ -597,7 +597,7 @@ private: /// BlockScan type using BlockScanT = BlockScan; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document struct __align__(16) _TempStorage { typename BlockScanT::TempStorage block_scan; @@ -609,7 +609,7 @@ private: } aliasable; }; -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED /// Shared storage reference _TempStorage& temp_storage; @@ -1183,7 +1183,7 @@ struct BlockRadixRankMatchEarlyCounts } }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -1211,6 +1211,6 @@ using block_radix_rank_t = ::cuda::std::_If< BlockRadixRankMatchEarlyCounts>>>>; } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 48650992918..3223b920b13 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -303,7 +303,7 @@ private: /// BlockExchange utility type for values using BlockExchangeValues = BlockExchange; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Shared memory storage layout type union _TempStorage { @@ -312,7 +312,7 @@ private: typename BlockExchangeKeys::TempStorage exchange_keys; typename BlockExchangeValues::TempStorage exchange_values; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /****************************************************************************** * Thread fields @@ -469,7 +469,7 @@ private: } public: -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * @brief Sort blocked -> striped arrangement @@ -554,7 +554,7 @@ public: } } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// @smemstorage{BlockRadixSort} struct TempStorage : Uninitialized<_TempStorage> diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 253fdb8b1d9..74934576cd5 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -173,7 +173,7 @@ private: /// Type used to index into the block's runs using RunOffsetT = uint32_t; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Shared memory type required by this thread block union _TempStorage { @@ -184,7 +184,7 @@ private: DecodedOffsetT run_offsets[BLOCK_RUNS]; } runs; }; // union TempStorage -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// Internal storage allocator (used when the user does not provide pre-allocated shared memory) _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index 0644e8ca254..c49eb36a52e 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -1291,7 +1291,7 @@ public: } //! 
@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans //! @name Exclusive prefix scan operations (no initial value, single datum per thread) //! @{ @@ -1445,7 +1445,7 @@ public: } //! @} end member group -#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans +#endif // _CCCL_DOXYGEN_INVOKED // Do not document no-initial-value scans //! @name Inclusive prefix sum operations //! @{ diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 9d057d7fe4b..443f7a7f93b 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -1229,12 +1229,12 @@ public: //! @} end member group }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template > struct BlockStoreType { using type = cub::BlockStore; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index e56a0ec1e27..d4fdd9c405f 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -142,7 +142,7 @@ struct ShiftDigitExtractor : BaseDigitExtractor } }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -564,7 +564,7 @@ struct traits_t } // namespace radix } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! Twiddling keys for radix sort template diff --git a/cub/cub/detail/array_utils.cuh b/cub/cub/detail/array_utils.cuh index cfc8fafb452..1857c895a3c 100644 --- a/cub/cub/detail/array_utils.cuh +++ b/cub/cub/detail/array_utils.cuh @@ -51,7 +51,7 @@ CUB_NAMESPACE_BEGIN namespace detail { -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /*********************************************************************************************************************** * Generic Array-like to Array Conversion @@ -74,7 +74,7 @@ to_array(const Input& input) return to_array_impl(input, ::cuda::std::make_index_sequence()>{}); } -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED } // namespace detail diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index 35c52f4aedb..d83b2c1179a 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -49,7 +49,7 @@ # include #endif // !_CCCL_COMPILER(NVRTC) -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +#ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes: /** * \def CUB_DISABLE_CDP diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index 6a5dd8ff039..3bda5e596f3 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -37,10 +37,10 @@ # pragma system_header #endif // no system header -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +#ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes: //! 
When this macro is defined, no NVTX ranges are emitted by CCCL # define CCCL_DISABLE_NVTX -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED // Enable the functionality of this header if: // * The NVTX3 C API is available in CTK diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index e63bd3456c0..61693d808e2 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -49,7 +49,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -247,6 +247,6 @@ static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_acquire(unsigned int con } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index fe16cae9674..9b8091738db 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -302,6 +302,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned c } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 84add4262e2..41728342abc 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -267,7 +267,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, @@ -283,7 +283,7 @@ public: return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. @@ -398,7 +398,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, @@ -413,7 +413,7 @@ public: return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. @@ -545,7 +545,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( void* d_temp_storage, @@ -561,7 +561,7 @@ public: return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! 
@rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. @@ -665,7 +665,7 @@ public: d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( void* d_temp_storage, @@ -680,7 +680,7 @@ public: return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index e6abc4bd07b..32e485df2b3 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -206,7 +206,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -233,7 +233,7 @@ struct DeviceHistogram num_samples, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. @@ -386,7 +386,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, @@ -417,7 +417,7 @@ struct DeviceHistogram row_stride_bytes, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using @@ -588,7 +588,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1017,7 +1017,7 @@ struct DeviceHistogram return HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. @@ -1157,7 +1157,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, @@ -1186,7 +1186,7 @@ struct DeviceHistogram row_stride_bytes, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples @@ -1346,7 +1346,7 @@ struct DeviceHistogram stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -263,7 +263,7 @@ public: return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. @@ -411,7 +411,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -586,7 +586,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -729,7 +729,7 @@ public: d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( void* d_temp_storage, @@ -746,7 +746,7 @@ public: return SortKeysCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. @@ -857,7 +857,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -874,7 +874,7 @@ public: return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. @@ -976,7 +976,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -992,7 +992,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Sorts items using a merge sorting method. diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 48666f1370b..621bf2b9070 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -223,7 +223,7 @@ struct DevicePartition stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! 
@rst //! Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into @@ -405,7 +405,7 @@ struct DevicePartition stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -818,7 +818,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -1252,7 +1252,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -1706,7 +1706,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2412,7 +2412,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -2552,7 +2552,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -2945,7 +2945,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -3345,7 +3345,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index a9b94f60534..bd78224be5d 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -205,7 +205,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), reduction_op, init, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef 
_CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, @@ -223,7 +223,7 @@ struct DeviceReduce return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide sum using the addition (``+``) operator. @@ -330,7 +330,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -345,7 +345,7 @@ struct DeviceReduce return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. @@ -456,7 +456,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -471,7 +471,7 @@ struct DeviceReduce return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. @@ -591,7 +591,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -606,7 +606,7 @@ struct DeviceReduce return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. @@ -715,7 +715,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -730,7 +730,7 @@ struct DeviceReduce return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) @@ -854,7 +854,7 @@ struct DeviceReduce d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -869,7 +869,7 @@ struct DeviceReduce return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Fuses transform and reduce operations @@ -1195,7 +1195,7 @@ struct DeviceReduce stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Enumerates the starting offsets and lengths of all non-trivial runs @@ -386,7 +386,7 @@ struct DeviceRunLengthEncode stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index e8b56709eda..e105fa36819 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -208,7 +208,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -224,7 +224,7 @@ struct DeviceScan return ExclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix sum in-place. @@ -302,7 +302,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, @@ -316,7 +316,7 @@ struct DeviceScan return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -450,7 +450,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -468,7 +468,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan using the specified @@ -579,7 +579,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, @@ -596,7 +596,7 @@ struct DeviceScan return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
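[Editor's note] All of the cub::Device* entry points touched by this patch share the same two-phase temporary-storage protocol, which is worth keeping in mind when reading the guarded overloads: call once with a null d_temp_storage to query the required size, allocate, then call again to run. A usage sketch against cub::DeviceReduce::Sum; sum_into is an illustrative wrapper, not CUB API:

  #include <cub/device/device_reduce.cuh>
  #include <cuda_runtime.h>

  cudaError_t sum_into(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
  {
    void* d_temp_storage      = nullptr;
    size_t temp_storage_bytes = 0;

    // Phase 1: d_temp_storage == nullptr, so only temp_storage_bytes is written.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

    cudaError_t status = cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream);
    if (status != cudaSuccess)
    {
      return status;
    }

    // Phase 2: the same call with real storage launches the reduction.
    status = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
    cudaFreeAsync(d_temp_storage, stream);
    return status;
  }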
Computes a device-wide exclusive prefix scan using the specified @@ -739,7 +739,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. @@ -880,7 +880,7 @@ struct DeviceScan return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group //! @name Inclusive scans @@ -1003,7 +1003,7 @@ struct DeviceScan d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -1019,7 +1019,7 @@ struct DeviceScan return InclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix sum in-place. @@ -1096,7 +1096,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, @@ -1110,7 +1110,7 @@ struct DeviceScan return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1333,7 +1333,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1350,7 +1350,7 @@ struct DeviceScan return InclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. @@ -1451,7 +1451,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, @@ -1466,7 +1466,7 @@ struct DeviceScan return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Computes a device-wide exclusive prefix sum-by-key with key equality @@ -1608,7 +1608,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide exclusive prefix scan-by-key using the @@ -1814,7 +1814,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide inclusive prefix scan-by-key using the @@ -2180,7 +2180,7 @@ struct DeviceScan stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_segmented_radix_sort.cuh b/cub/cub/device/device_segmented_radix_sort.cuh index cc627b971ca..490caf36c48 100644 --- a/cub/cub/device/device_segmented_radix_sort.cuh +++ b/cub/cub/device/device_segmented_radix_sort.cuh @@ -265,7 +265,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -300,7 +300,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) @@ -476,7 +476,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -507,7 +507,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). @@ -683,7 +683,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -718,7 +718,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). @@ -898,7 +898,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -929,7 +929,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group //! 
@name Keys-only @@ -1092,7 +1092,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1123,7 +1123,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required). @@ -1291,7 +1291,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -1320,7 +1320,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). @@ -1479,7 +1479,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1510,7 +1510,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). @@ -1676,7 +1676,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -1705,7 +1705,7 @@ public: end_bit, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 9d4de803e86..7ad043eab5f 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -272,7 +272,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, @@ -444,7 +444,7 @@ public: return Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. @@ -572,7 +572,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, @@ -590,7 +590,7 @@ public: return Min( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Finds the first device-wide minimum in each segment using the @@ -742,7 +742,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, @@ -760,7 +760,7 @@ public: return ArgMin( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. @@ -877,7 +877,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, @@ -895,7 +895,7 @@ public: return Max( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Finds the first device-wide maximum in each segment using the @@ -1050,7 +1050,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, @@ -1068,7 +1068,7 @@ public: return ArgMax( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 1f219aebd25..10b5c6d2388 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -306,7 +306,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -333,7 +333,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -503,7 +503,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -530,7 +530,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -702,7 +702,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, @@ -720,7 +720,7 @@ public: return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -893,7 +893,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, 
num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, @@ -911,7 +911,7 @@ public: return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into ascending order. Approximately @@ -1049,7 +1049,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1076,7 +1076,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. @@ -1214,7 +1214,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1241,7 +1241,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into ascending order. @@ -1381,7 +1381,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, @@ -1399,7 +1399,7 @@ public: return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of keys into descending order. 
@@ -1538,7 +1538,7 @@ public: d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, @@ -1556,7 +1556,7 @@ public: return StableSortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -1757,7 +1757,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -1788,7 +1788,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -1985,7 +1985,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2016,7 +2016,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -2213,7 +2213,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, @@ -2240,7 +2240,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // Internal version without NVTX range @@ -2436,7 +2436,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, @@ -2463,7 +2463,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into ascending order. @@ -2623,7 +2623,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -2654,7 +2654,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. @@ -2814,7 +2814,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -2845,7 +2845,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into ascending order. 
@@ -3011,7 +3011,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, @@ -3038,7 +3038,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Sorts segments of key-value pairs into descending order. @@ -3203,7 +3203,7 @@ public: stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, @@ -3230,7 +3230,7 @@ public: d_end_offsets, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index b537ab9204b..27a18cf809a 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -203,7 +203,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -221,7 +221,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``. @@ -341,7 +341,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, @@ -358,7 +358,7 @@ struct DeviceSelect return Flagged( d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. @@ -498,7 +498,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -516,7 +516,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. @@ -648,7 +648,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, @@ -665,7 +665,7 @@ struct DeviceSelect return If( d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Uses the ``select_op`` functor applied to ``d_flags`` to selectively copy the @@ -1011,7 +1011,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, @@ -1028,7 +1028,7 @@ struct DeviceSelect return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive @@ -1330,7 +1330,7 @@ struct DeviceSelect stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template ::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, @@ -239,7 +239,7 @@ struct DeviceSpmv num_nonzeros, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @} end member group }; diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh index 984109692f6..ef00248b448 100644 --- a/cub/cub/device/device_transform.cuh +++ b/cub/cub/device/device_transform.cuh @@ -66,7 +66,7 @@ struct DeviceTransform ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB // APIs. template @@ -88,7 +88,7 @@ struct DeviceTransform return Transform( ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding @@ -120,7 +120,7 @@ struct DeviceTransform stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB // APIs. template @@ -146,7 +146,7 @@ struct DeviceTransform ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! Overview @@ -189,7 +189,7 @@ struct DeviceTransform ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, @@ -209,7 +209,7 @@ struct DeviceTransform return TransformStableArgumentAddresses( ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @rst //! 
Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding @@ -241,7 +241,7 @@ struct DeviceTransform stream); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( void* d_temp_storage, @@ -265,7 +265,7 @@ struct DeviceTransform ::cuda::std::move(transform_op), stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index af41c7137c7..4eef4fb5b86 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -169,7 +169,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy , stream(stream) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_DEPRECATED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( void* d_temp_storage, @@ -190,7 +190,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /// Invocation template @@ -356,7 +356,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( void* d_temp_storage, @@ -372,7 +372,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index dab551559a4..15e0311fa2a 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -1036,7 +1036,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1067,7 +1067,7 @@ public: stream, is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types @@ -1202,7 +1202,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, @@ -1233,7 +1233,7 @@ public: stream, is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit @@ -1420,7 +1420,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, @@ -1453,7 +1453,7 @@ public: stream, 
is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types @@ -1592,7 +1592,7 @@ public: return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, @@ -1625,7 +1625,7 @@ public: stream, is_byte_sample); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 23855d05951..c485e80e446 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -469,7 +469,7 @@ struct DispatchReduce : SelectedPolicy , launcher_factory(launcher_factory) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( void* d_temp_storage, @@ -494,7 +494,7 @@ struct DispatchReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //--------------------------------------------------------------------------- // Small-problem (single tile) invocation @@ -814,7 +814,7 @@ struct DispatchReduce : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -831,7 +831,7 @@ struct DispatchReduce : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; /** @@ -1008,7 +1008,7 @@ struct DispatchSegmentedReduce : SelectedPolicy , ptx_version(ptx_version) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( void* d_temp_storage, @@ -1037,7 +1037,7 @@ struct DispatchSegmentedReduce : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //--------------------------------------------------------------------------- // Chained policy invocation @@ -1238,7 +1238,7 @@ struct DispatchSegmentedReduce : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1267,7 +1267,7 @@ struct DispatchSegmentedReduce : SelectedPolicy init, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 00d7280701a..482b9afe19f 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -550,7 +550,7 @@ struct DispatchReduceByKey return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef 
_CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -581,7 +581,7 @@ struct DispatchReduceByKey num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 2a6a0b3b641..bb99b20ab8a 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -543,7 +543,7 @@ struct DeviceRleDispatch return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -570,7 +570,7 @@ struct DeviceRleDispatch num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index d1efaa01cd2..691fc2ece8c 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -330,7 +330,7 @@ struct DispatchScan : SelectedPolicy , ptx_version(ptx_version) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, @@ -355,7 +355,7 @@ struct DispatchScan : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) @@ -593,7 +593,7 @@ struct DispatchScan : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -610,7 +610,7 @@ struct DispatchScan : SelectedPolicy return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index aa04ce9f2ec..bf26c54e90e 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -342,7 +342,7 @@ struct DispatchScanByKey : SelectedPolicy , ptx_version(ptx_version) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( void* d_temp_storage, @@ -371,7 +371,7 @@ struct DispatchScanByKey : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) @@ -622,7 +622,7 @@ struct DispatchScanByKey : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED 
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -651,7 +651,7 @@ struct DispatchScanByKey : SelectedPolicy num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 80d8973c759..a98e1de494a 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -1131,7 +1131,7 @@ struct DispatchSegmentedSort : SelectedPolicy , stream(stream) {} -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( void* d_temp_storage, @@ -1158,7 +1158,7 @@ struct DispatchSegmentedSort : SelectedPolicy { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() @@ -1440,7 +1440,7 @@ struct DispatchSegmentedSort : SelectedPolicy return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -1469,7 +1469,7 @@ struct DispatchSegmentedSort : SelectedPolicy is_overwrite_okay, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetNumPasses(int radix_bits) diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 807ba62e4b3..7fbf9ccda4f 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -845,7 +845,7 @@ struct DispatchSelectIf : SelectedPolicy return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, @@ -874,7 +874,7 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED }; CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index a36a7f7890a..7d3d3094a48 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -893,7 +893,7 @@ struct DispatchSpmv return error; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template grid_queue, OffsetT n grid_queue.FillAndResetDrain(num_items); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 14577a56c92..6679f04b1e8 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -110,7 +110,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t Thread //@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Helper structure for templated load iteration (inductive case) /// \deprecated [Since 2.6.0] Use 
UnrolledThreadLoad() or UnrolledCopy() instead. @@ -378,6 +378,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t Thread return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 2de65083843..45d2446188f 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -396,7 +396,7 @@ CUB_DEPRECATED _CCCL_HOST_DEVICE BinaryFlip MakeBinaryFlip(BinaryOpT } _CCCL_SUPPRESS_DEPRECATED_POP -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace internal { @@ -720,6 +720,6 @@ using simd_type_t = typename CubOperatorToSimdOperator::simd_type; } // namespace internal -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 2a5b6566a26..d4b4a89fdfd 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -145,11 +145,11 @@ CUB_NAMESPACE_BEGIN //! template ()[0])>, #else typename ValueT = random_access_value_t, -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED typename AccumT = ::cuda::std::__accumulator_t> _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& input, ReductionOp reduction_op); // forward declaration @@ -158,7 +158,7 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& * Internal Reduction Implementations **********************************************************************************************************************/ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -697,6 +697,6 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, Reductio } // namespace internal -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index d0927a0d28d..a895884a60d 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -114,7 +114,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val); //@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Helper structure for templated store iteration (inductive case) template @@ -353,6 +353,6 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) ThreadStore(itr, val, Int2Type(), Int2Type::value>()); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index d9559b874f3..39e59bdf4de 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -110,7 +110,7 @@ struct CachingDeviceAllocator /// Invalid size static constexpr size_t INVALID_SIZE = (size_t) -1; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Invalid device ordinal static constexpr int INVALID_DEVICE_ORDINAL = -1; @@ -299,7 +299,7 @@ struct CachingDeviceAllocator /// Set of live device allocations currently in use BusyBlocks live_blocks; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED 
//--------------------------------------------------------------------- // Methods diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index 5f8780620fa..1d6d7289b78 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -52,7 +52,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // \deprecated [Since 2.1.0] # define CUB_USE_COOPERATIVE_GROUPS diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 006a070a7e9..6f54239bf84 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -42,7 +42,7 @@ #include // IWYU pragma: export -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // Deprecation warnings may be silenced by defining the following macros. These // may be combined. @@ -133,4 +133,4 @@ CUB_COMPILER_DEPRECATION_SOFT(C++ 17, C++ 14); # undef CUB_COMP_DEPR_IMPL0 # undef CUB_COMP_DEPR_IMPL1 -#endif // !DOXYGEN_SHOULD_SKIP_THIS +#endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index edb75a64da3..0a08c9ae223 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -48,7 +48,7 @@ #include -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: +#ifdef _CCCL_DOXYGEN_INVOKED // Only parse this during doxygen passes: /** * @def CUB_DEBUG_LOG @@ -92,7 +92,7 @@ */ # define CUB_DEBUG_ALL -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED // `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index e395b17f6d3..5b8c1f3f1f3 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -65,7 +65,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -90,7 +90,7 @@ template CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel() {} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Returns the current device or -1 if an error occurred. @@ -105,13 +105,13 @@ CUB_RUNTIME_FUNCTION inline int CurrentDevice() return device; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document //! @brief RAII helper which saves the current device and switches to the specified device on construction and switches //! to the saved device on destruction. using SwitchDevice = ::cuda::__ensure_current_device; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Returns the number of CUDA devices available or -1 if an error @@ -171,7 +171,7 @@ CUB_RUNTIME_FUNCTION inline int DeviceCount() return result; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * \brief Per-device cache for a CUDA attribute value; the attribute is queried * and stored for each device upon construction. @@ -286,7 +286,7 @@ public: return entry.payload; } }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). 
diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index f98751b2ddf..b3ab7e73629 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -49,7 +49,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # define CUB_PREVENT_MACRO_SUBSTITUTION template constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 3fc73b90304..aa522d9576e 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -97,7 +97,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int return ret; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * Bitfield-extract. @@ -135,7 +135,7 @@ BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type } # endif -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p @@ -199,7 +199,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned return ret; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * Sync-threads barrier. @@ -329,7 +329,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) return d; } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * \brief Terminates the calling thread @@ -689,7 +689,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned in return output; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -751,7 +751,7 @@ struct warp_matcher_t }; } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * Compute a 32b mask of threads having the same least-significant diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index ee456083c3e..61c00f969f4 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -48,7 +48,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). @@ -112,6 +112,6 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t AliasTemporaries( return cudaSuccess; } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 42ffef0f6b0..f062ebc4ae9 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -85,7 +85,7 @@ CUB_NAMESPACE_BEGIN * Conditional types ******************************************************************************/ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { //! Alias to the given iterator's value_type. 
@@ -142,7 +142,7 @@ struct Log2 }; // Inductive case }; -# ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# ifndef _CCCL_DOXYGEN_INVOKED // Do not document template struct Log2 @@ -155,7 +155,7 @@ struct Log2 }; }; -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED /** * \brief Statically determine if N is a power-of-two @@ -169,13 +169,13 @@ struct PowerOfTwo }; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /****************************************************************************** * Marker types ******************************************************************************/ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /** * \brief A simple "null" marker type @@ -1156,6 +1156,6 @@ template struct Traits : NumericTraits::type> {}; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/util_vsmem.cuh b/cub/cub/util_vsmem.cuh index d2e5541c09c..f5926ce11e5 100644 --- a/cub/cub/util_vsmem.cuh +++ b/cub/cub/util_vsmem.cuh @@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace detail { @@ -248,6 +248,6 @@ using vsmem_helper_default_fallback_policy_t = } // namespace detail -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 1d647a06c86..00440c18bdf 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -170,14 +170,14 @@ private: }; public: -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = ::cuda::std::_If, WarpReduceSmem>; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: /// Shared memory storage layout type for WarpReduce @@ -662,7 +662,7 @@ public: //! @} end member group }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template class WarpReduce { @@ -740,6 +740,6 @@ public: return input; } }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED CUB_NAMESPACE_END diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 4bcd93d259f..5bfd60da9d3 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -273,12 +273,12 @@ public: __stream_ = __new_stream; } -# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken +# ifndef _CCCL_DOXYGEN_INVOKED // friend functions are currently broken //! @brief Forwards the passed properties _CCCL_TEMPLATE(class _Property) _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>) _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {} -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED //! @brief Internal method to grow the allocation to a new size \p __count. //! @param __count The new size of the allocation. 
diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index d480ded4588..38c968d25c8 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -238,12 +238,12 @@ public: return __mr_; } -# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken +# ifndef _CCCL_DOXYGEN_INVOKED // friend functions are currently broken //! @brief Forwards the passed Properties _CCCL_TEMPLATE(class _Property) _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>) _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_buffer&, _Property) noexcept {} -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED //! @brief Internal method to grow the allocation to a new size \p __count. //! @param __count The new size of the allocation. diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 52c109bff6a..3e19bafb4e7 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -68,7 +68,7 @@ public: template <::cudaDeviceAttr _Attr> using attr_result_t = typename detail::__dev_attr<_Attr>::type; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # if defined(_CCCL_COMPILER_MSVC) // When __EDG__ is defined, std::construct_at will not permit constructing // a device object from an __emplace_device object. This is a workaround. diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh index bae301feb0b..7d54dd4f750 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh @@ -405,11 +405,11 @@ public: return __pool_; } -# ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen cannot handle the friend function +# ifndef _CCCL_DOXYGEN_INVOKED // Doxygen cannot handle the friend function //! @brief Enables the \c device_accessible property for \c device_memory_resource. //! @relates device_memory_resource friend constexpr void get_property(device_memory_resource const&, _CUDA_VMR::device_accessible) noexcept {} -# endif // DOXYGEN_SHOULD_SKIP_THIS +# endif // _CCCL_DOXYGEN_INVOKED }; static_assert(_CUDA_VMR::resource_with, ""); diff --git a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh index 0d2026fdbbe..7fe81211569 100644 --- a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh @@ -213,7 +213,7 @@ public: */ virtual size_t data_hash(instance_id_t instance_id) const = 0; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this /** * @brief Returns the size of the data represented by this logical data. * @@ -221,7 +221,7 @@ public: * purposes, or for the scheduling strategies. */ virtual size_t data_footprint() const = 0; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Get the part of the data interface that is common to all data instances. 
diff --git a/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh index d2cc954bfa9..e79dca54141 100644 --- a/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh @@ -301,7 +301,7 @@ public: * * @tparam level The level in the hierarchy to check for the `sync` property. Level starts from 0 (top-level). */ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this template static inline constexpr bool is_synchronizable = [] { if constexpr (level > 0) @@ -395,7 +395,7 @@ private: mem mem_bytes = mem(0); }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this /** * @brief Creates and returns a `thread_hierarchy_spec` object with no synchronization and dynamic width. * @@ -480,7 +480,7 @@ constexpr auto con(const P&... p) return R(p...); } /// @} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED #ifdef UNITTESTED_FILE diff --git a/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh b/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh index 0b6a0cd7c78..1d8d00d6670 100644 --- a/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/reduction_base.cuh @@ -42,7 +42,7 @@ public: reduction_operator_base& operator=(const reduction_operator_base&) = delete; reduction_operator_base(const reduction_operator_base&) = delete; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails here +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails here // Reduction operator (inout, in) virtual void op_untyped( @@ -62,7 +62,7 @@ public: const exec_place& e, event_list& prereq_in) = 0; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED private: // not used for now ... 
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh index 0b011cce0f5..2d3036ec143 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -30,7 +30,7 @@ #include #include -#ifndef DOXYGEN_SHOULD_SKIP_THIS // do not document +#ifndef _CCCL_DOXYGEN_INVOKED // do not document # if !defined(_CCCL_COMPILER_MSVC) # define STATEFUL_CALLBACKS @@ -603,4 +603,4 @@ inline bool cudaCallbackQueueProgress(callback_queue* q, bool flag) } // end namespace cuda::experimental::stf # endif // !_CCCL_COMPILER_MSVC -#endif // DOXYGEN_SHOULD_SKIP_THIS do not document +#endif // _CCCL_DOXYGEN_INVOKED do not document diff --git a/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh b/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh index 04b2badf7f2..383f43961be 100644 --- a/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh +++ b/cudax/include/cuda/experimental/__stf/places/inner_shape.cuh @@ -31,7 +31,7 @@ namespace cuda::experimental::stf { -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen fails to parse this +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen fails to parse this /** * @brief Applying "inner" on a mdspan shape returns an explicit shape which extents @@ -89,7 +89,7 @@ _CCCL_HOST_DEVICE box inner(const box& s) return box(inner_extents); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED #ifdef UNITTESTED_FILE UNITTEST("inner explicit shape (explicit bounds)") diff --git a/cudax/include/cuda/experimental/__stf/stream/reduction.cuh b/cudax/include/cuda/experimental/__stf/stream/reduction.cuh index 4493672c70b..deea02bbd9c 100644 --- a/cudax/include/cuda/experimental/__stf/stream/reduction.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/reduction.cuh @@ -65,7 +65,7 @@ public: const exec_place& e, cudaStream_t s) = 0; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen has issues with this code void op_untyped( logical_data_untyped& d, const data_place& inout_memory_node, @@ -110,7 +110,7 @@ public: prereqs = async_op.end(d.get_ctx()); } -#endif // DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +#endif // _CCCL_DOXYGEN_INVOKED // doxygen has issues with this code }; /** diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index 3c51f7304bb..48a28aa6648 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -617,7 +617,7 @@ private: template class deferred_stream_task; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // doxygen has issues with this code +#ifndef _CCCL_DOXYGEN_INVOKED // doxygen has issues with this code /* * Base of all deferred tasks. Stores the needed information for typed deferred tasks to run (see below). 
*/ @@ -877,6 +877,6 @@ public: }; } }; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED } // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/core.cuh b/cudax/include/cuda/experimental/__stf/utility/core.cuh index e0eb417aad7..23b0ff5560f 100644 --- a/cudax/include/cuda/experimental/__stf/utility/core.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/core.cuh @@ -79,7 +79,7 @@ inline int setenv(const char* name, const char* value, int overwrite) } #endif -#ifndef DOXYGEN_SHOULD_SKIP_THIS // FIXME Doxygen is lost with decltype(auto) +#ifndef _CCCL_DOXYGEN_INVOKED // FIXME Doxygen is lost with decltype(auto) /** * @brief Custom move function that performs checks on the argument type. * @@ -97,7 +97,7 @@ _CCCL_HOST_DEVICE constexpr decltype(auto) mv(T&& obj) static_assert(!::std::is_const_v<::std::remove_reference_t>, "Misleading move from const lvalue."); return ::std::move(obj); } -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /** * @brief Creates a `std::shared_ptr` managing a copy of the given object. @@ -609,7 +609,7 @@ private: [[no_unique_address]] state_t payload = state_t(); }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // Operator implementations # define _3197bc91feaf98030b2cc0b441d7b0ea(op) \ template \ @@ -691,6 +691,6 @@ _3197bc91feaf98030b2cc0b441d7b0ea(>=); # undef _3197bc91feaf98030b2cc0b441d7b0ea -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED } // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/utility/unittest.cuh b/cudax/include/cuda/experimental/__stf/utility/unittest.cuh index 3cc470df80e..dd42fbdd9bd 100644 --- a/cudax/include/cuda/experimental/__stf/utility/unittest.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/unittest.cuh @@ -31,7 +31,7 @@ #include -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document // One level of macro indirection is required in order to resolve __COUNTER__, // and get varname1 instead of varname__COUNTER__. # define _55f56f4e3b45c8cf3fa50b28fed72e2a(a, b) _a56ec7069122ad2e0888a508ecdc4639(a, b) @@ -705,7 +705,7 @@ UNITTEST("cuda::std::source_location") test_func(); }; -#else // DOXYGEN_SHOULD_SKIP_THIS Do not document +#else // _CCCL_DOXYGEN_INVOKED Do not document // Ensure these are ignored by Doxygen # define UNITTEST(name, ...) 
-#endif // DOXYGEN_SHOULD_SKIP_THIS Do not document +#endif // _CCCL_DOXYGEN_INVOKED Do not document diff --git a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh index 6c37d4f6996..c644dd19a1c 100644 --- a/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh +++ b/cudax/include/cuda/experimental/__utility/ensure_current_device.cuh @@ -27,7 +27,7 @@ #include #include -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document namespace cuda::experimental { @@ -101,5 +101,5 @@ struct [[maybe_unused]] __ensure_current_device } }; } // namespace cuda::experimental -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED #endif // _CUDAX__UTILITY_ENSURE_CURRENT_DEVICE diff --git a/docs/repo.toml b/docs/repo.toml index 9a684c4d5f4..f7c426f13db 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -159,11 +159,10 @@ doxygen_predefined = [ "_CCCL_DIAG_SUPPRESS_ICC(x)=", "_CCCL_DIAG_SUPPRESS_MSVC(x)=", "_CCCL_DIAG_SUPPRESS_NVHPC(x)=", + "_CCCL_DOXYGEN_INVOKED", "_CCCL_REQUIRES(x)= ::cuda::std::enable_if_t = 0>", "_CCCL_TEMPLATE(x)=template x _CCCL_EAT_REST", - "DOXYGEN_SHOULD_SKIP_THIS", - "DOXYGEN_ACTIVE", "__device__", "__host__", "__forceinline__", @@ -275,8 +274,7 @@ doxygen_predefined = [ "CUDASTF_HOST=", "CUDASTF_DEVICE=", "CUDASTF_HOST_DEVICE=", - "DOXYGEN_SHOULD_SKIP_THIS", - "DOXYGEN_ACTIVE", + "_CCCL_DOXYGEN_INVOKED", "_LIBCUDACXX_DEPRECATED_IN_CXX11", "THRUST_DISABLE_NAMESPACE_MAGIC", "THRUST_IGNORE_NAMESPACE_MAGIC_ERROR", @@ -445,8 +443,7 @@ doxygen_predefined = [ "_CUDAX_TRIVIAL_DEVICE_API", "_CUDAX_PUBLIC_API", "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE=", - "DOXYGEN_SHOULD_SKIP_THIS", - "DOXYGEN_ACTIVE", + "_CCCL_DOXYGEN_INVOKED", ] # make sure to use ./fetch_imgs.sh diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index bef58f29966..4bd928b0013 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -42,7 +42,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template struct __type_list; @@ -947,7 +947,7 @@ template using __type_iota = decltype(__detail::__type_iota_fn<_Ty, _Start, _Stride>(static_cast*>(nullptr))); -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED _LIBCUDACXX_END_NAMESPACE_STD diff --git a/thrust/thrust/detail/type_deduction.h b/thrust/thrust/detail/type_deduction.h index 08f31630bb5..a1d41de9676 100644 --- a/thrust/thrust/detail/type_deduction.h +++ b/thrust/thrust/detail/type_deduction.h @@ -59,7 +59,7 @@ /// // Trailing return types seem to confuse Doxygen, and cause it to interpret // parts of the function's body as new function signatures. -#if defined(THRUST_DOXYGEN) +#if defined(_CCCL_DOXYGEN_INVOKED) # define THRUST_DECLTYPE_RETURNS(...) \ { \ return (__VA_ARGS__); \ @@ -81,7 +81,7 @@ /// // Trailing return types seem to confuse Doxygen, and cause it to interpret // parts of the function's body as new function signatures. -#if defined(THRUST_DOXYGEN) +#if defined(_CCCL_DOXYGEN_INVOKED) # define THRUST_DECLTYPE_RETURNS(...) 
\ { \ return (__VA_ARGS__); \ diff --git a/thrust/thrust/device_malloc_allocator.h b/thrust/thrust/device_malloc_allocator.h index e5d2e04fc19..c9de52a8404 100644 --- a/thrust/thrust/device_malloc_allocator.h +++ b/thrust/thrust/device_malloc_allocator.h @@ -40,12 +40,12 @@ THRUST_NAMESPACE_BEGIN // forward declarations to WAR circular #includes -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document template class device_ptr; template device_ptr device_malloc(const std::size_t n); -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! \addtogroup allocators Allocators * \ingroup memory_management diff --git a/thrust/thrust/device_ptr.h b/thrust/thrust/device_ptr.h index 5c5f55a3a83..058d12cb83f 100644 --- a/thrust/thrust/device_ptr.h +++ b/thrust/thrust/device_ptr.h @@ -154,14 +154,14 @@ class device_ptr return *this; } -#if THRUST_DOXYGEN +#ifdef _CCCL_DOXYGEN_INVOKED /*! \brief Return the raw pointer that this \c device_ptr points to. */ _CCCL_HOST_DEVICE T* get() const; #endif }; -#if THRUST_DOXYGEN +#ifdef _CCCL_DOXYGEN_INVOKED /*! Write the address that a \c device_ptr points to to an output stream. * * \param os The output stream. diff --git a/thrust/thrust/device_reference.h b/thrust/thrust/device_reference.h index 40a6790a5a1..545d5449bee 100644 --- a/thrust/thrust/device_reference.h +++ b/thrust/thrust/device_reference.h @@ -961,7 +961,7 @@ _CCCL_HOST_DEVICE void swap(device_reference& x, device_reference& y) // declare these methods for the purpose of Doxygenating them // they actually are defined for a base class -#if THRUST_DOXYGEN +#ifdef _CCCL_DOXYGEN_INVOKED /*! Writes to an output stream the value of a \p device_reference. * * \param os The output stream. diff --git a/thrust/thrust/memory.h b/thrust/thrust/memory.h index 6462545590b..290c99b7b2e 100644 --- a/thrust/thrust/memory.h +++ b/thrust/thrust/memory.h @@ -138,7 +138,7 @@ template _CCCL_HOST_DEVICE pointer malloc(const thrust::detail::execution_policy_base& system, std::size_t n); -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! This version of \p malloc allocates typed uninitialized storage associated with a given system. * diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h index 6762271cb47..bb9bf1cfb4b 100644 --- a/thrust/thrust/optional.h +++ b/thrust/thrust/optional.h @@ -1976,7 +1976,7 @@ optional(T) -> optional; #endif // Doxygen chokes on the trailing return types used below. -#if !defined(THRUST_DOXYGEN) +#if !defined(_CCCL_DOXYGEN_INVOKED) /// \exclude namespace detail { @@ -2034,7 +2034,7 @@ _CCCL_HOST_DEVICE auto optional_map_impl(Opt&& opt, F&& f) -> optional` acts similarly /// to a `T*`, but provides more operations and shows intent more clearly. diff --git a/thrust/thrust/pair.h b/thrust/thrust/pair.h index e3c74677993..9f35a388bc7 100644 --- a/thrust/thrust/pair.h +++ b/thrust/thrust/pair.h @@ -49,12 +49,12 @@ THRUST_NAMESPACE_BEGIN * \tparam N This parameter selects the member of interest. * \tparam T A \c pair type of interest. */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_element = _CUDA_VSTD::tuple_element; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_element; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! 
This convenience metafunction is included for compatibility with * \p tuple. It returns \c 2, the number of elements of a \p pair, @@ -62,12 +62,12 @@ using _CUDA_VSTD::tuple_element; * * \tparam Pair A \c pair type of interest. */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_size = _CUDA_VSTD::tuple_size; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_size; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! \p pair is a generic data structure encapsulating a heterogeneous * pair of values. @@ -80,12 +80,12 @@ using _CUDA_VSTD::tuple_size; * requirements on the type of \p T2. T2's type is * provided by pair::second_type. */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using pair = _CUDA_VSTD::pair; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::pair; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED using _CUDA_VSTD::get; using _CUDA_VSTD::make_pair; diff --git a/thrust/thrust/random/linear_congruential_engine.h b/thrust/thrust/random/linear_congruential_engine.h index ce47c08b619..c289667749f 100644 --- a/thrust/thrust/random/linear_congruential_engine.h +++ b/thrust/thrust/random/linear_congruential_engine.h @@ -143,11 +143,11 @@ class linear_congruential_engine /*! The smallest value this \p linear_congruential_engine may potentially produce. */ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen breaks on the ternary :shrug: +#ifndef _CCCL_DOXYGEN_INVOKED // Doxygen breaks on the ternary :shrug: static const result_type min = c == 0u ? 1u : 0u; #else static const result_type min = 0u; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! The largest value this \p linear_congruential_engine may potentially produce. */ diff --git a/thrust/thrust/tuple.h b/thrust/thrust/tuple.h index 1f8ed8943e5..d0d13670f0c 100644 --- a/thrust/thrust/tuple.h +++ b/thrust/thrust/tuple.h @@ -94,12 +94,12 @@ _CCCL_HOST_DEVICE inline bool operator>(const null_type&, const null_type&) * \see pair * \see tuple */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_element = _CUDA_VSTD::tuple_element; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_element; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! This metafunction returns the number of elements * of a \p tuple type of interest. @@ -109,12 +109,12 @@ using _CUDA_VSTD::tuple_element; * \see pair * \see tuple */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple_size = _CUDA_VSTD::tuple_size; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple_size; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED /*! 
\brief \p tuple is a heterogeneous, fixed-size collection of values. * An instantiation of \p tuple with two arguments is similar to an @@ -153,12 +153,12 @@ using _CUDA_VSTD::tuple_size; * \see tuple_size * \see tie */ -#ifdef DOXYGEN_SHOULD_SKIP_THIS // Provide a fake alias for doxygen +#ifdef _CCCL_DOXYGEN_INVOKED // Provide a fake alias for doxygen template using tuple = _CUDA_VSTD::tuple; -#else // ^^^ DOXYGEN_SHOULD_SKIP_THIS ^^^ / vvv !DOXYGEN_SHOULD_SKIP_THIS vvv +#else // ^^^ _CCCL_DOXYGEN_INVOKED ^^^ / vvv !_CCCL_DOXYGEN_INVOKED vvv using _CUDA_VSTD::tuple; -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED using _CUDA_VSTD::get; using _CUDA_VSTD::make_tuple; From 667886ef0d7db34a412b06aba94bd0a9bf502bb9 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:21:33 +0100 Subject: [PATCH 05/45] implement C++26 `std::span::at` (#2924) Co-authored-by: Bernhard Manfred Gruber --- .../cuda/std/detail/libcxx/include/span | 19 ++ libcudacxx/include/cuda/std/version | 2 +- .../views/views.span/span.elem/at.pass.cpp | 225 ++++++++++++++++++ 3 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span index 8257ac93f1b..afe5ea34519 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/span +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span @@ -171,6 +171,7 @@ template #include #include #include // for ptrdiff_t +#include // standard-mandated includes #include @@ -502,6 +503,15 @@ public: return __data_[__idx]; } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(size_type __idx) const + { + if (__idx >= size()) + { + _CUDA_VSTD::__throw_out_of_range("span::at"); + } + return __data_[__idx]; + } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference front() const noexcept { _CCCL_ASSERT(!empty(), "span::front() on empty span"); @@ -731,6 +741,15 @@ public: return __data_[__idx]; } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(size_type __idx) const + { + if (__idx >= size()) + { + _CUDA_VSTD::__throw_out_of_range("span::at"); + } + return __data_[__idx]; + } + _LIBCUDACXX_HIDE_FROM_ABI constexpr reference front() const noexcept { _CCCL_ASSERT(!empty(), "span::front() on empty span"); diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 059bfcccc66..841aa449c77 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -60,7 +60,7 @@ // # define __cccl_lib_shared_timed_mutex 201402L # endif // !_LIBCUDACXX_HAS_NO_THREADS # define __cccl_lib_source_location 201907L -# define __cccl_lib_span 202002L +# define __cccl_lib_span 202311L // # define __cccl_lib_string_udls 201304L # define __cccl_lib_transformation_trait_aliases 201304L # define __cccl_lib_transparent_operators 201210L diff --git a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp new file mode 100644 index 00000000000..47f45804aad --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.elem/at.pass.cpp @@ -0,0 +1,225 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++11 + +// + +// constexpr reference at(size_type idx) const; + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +#ifndef TEST_HAS_NO_EXCEPTIONS +# include +#endif // !TEST_HAS_NO_EXCEPTIONS + +template +__host__ __device__ constexpr void testSpanAt(SpanT&& anySpan, int index, int expectedValue) +{ + // non-const + { + auto elem = anySpan.at(index); + ASSERT_SAME_TYPE(ReferenceT, decltype(anySpan.at(index))); + assert(elem == expectedValue); + } + + // const + { + auto elem = cuda::std::as_const(anySpan).at(index); + ASSERT_SAME_TYPE(ReferenceT, decltype(cuda::std::as_const(anySpan).at(index))); + assert(elem == expectedValue); + } +} + +__host__ __device__ constexpr bool test() +{ + // With static extent + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084}; + cuda::std::span arrSpan{arr}; + + assert(cuda::std::dynamic_extent != arrSpan.extent); + + using ReferenceT = typename decltype(arrSpan)::reference; + + testSpanAt(arrSpan, 0, 0); + testSpanAt(arrSpan, 1, 1); + testSpanAt(arrSpan, 6, 9084); + } + + // With dynamic extent + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084}; + cuda::std::span dynSpan{arr}; + + assert(cuda::std::dynamic_extent == dynSpan.extent); + + using ReferenceT = typename decltype(dynSpan)::reference; + + testSpanAt(dynSpan, 0, 0); + testSpanAt(dynSpan, 1, 1); + testSpanAt(dynSpan, 6, 9084); + } + + return true; +} + +#ifndef TEST_HAS_NO_EXCEPTIONS +void test_exceptions() +{ + // With static extent + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084, cuda::std::numeric_limits::max()}; + const cuda::std::span arrSpan{arr}; + + try + { + using SizeT = typename decltype(arrSpan)::size_type; + cuda::std::ignore = arrSpan.at(cuda::std::numeric_limits::max()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = arrSpan.at(arr.size()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = arrSpan.at(arr.size() - 1); + // pass + assert(arrSpan.at(arr.size() - 1) == cuda::std::numeric_limits::max()); + } + catch (...) + { + assert(false); + } + } + + { + cuda::std::array arr{}; + const cuda::std::span arrSpan{arr}; + + try + { + cuda::std::ignore = arrSpan.at(0); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + } + + // With dynamic extent + + { + cuda::std::array arr{0, 1, 2, 3, 4, 5, 9084, cuda::std::numeric_limits::max()}; + const cuda::std::span dynSpan{arr}; + + try + { + using SizeT = typename decltype(dynSpan)::size_type; + cuda::std::ignore = dynSpan.at(cuda::std::numeric_limits::max()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = dynSpan.at(arr.size()); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + + try + { + cuda::std::ignore = dynSpan.at(arr.size() - 1); + assert(dynSpan.at(arr.size() - 1) == cuda::std::numeric_limits::max()); + } + catch (...) 
+ { + assert(false); + } + } + + { + cuda::std::array arr{}; + const cuda::std::span dynSpan{arr}; + + try + { + cuda::std::ignore = dynSpan.at(0); + assert(false); + } + catch (const std::out_of_range&) + { + // pass + } + catch (...) + { + assert(false); + } + } +} +#endif // TEST_HAS_NO_EXCEPTIONS + +int main(int, char**) +{ + test(); + static_assert(test(), ""); + +#ifndef TEST_HAS_NO_EXCEPTIONS + NV_IF_TARGET(NV_IS_HOST, (test_exceptions();)) +#endif // TEST_HAS_NO_EXCEPTIONS + + return 0; +} From bc45573d680911f18aa8e8c0a970ef8b0742ab9c Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 08:51:57 +0100 Subject: [PATCH 06/45] move msvc compiler macros to new version (#2885) Co-authored-by: Michael Schellenberger Costa Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Eric Niebler --- cub/cub/detail/nvtx.cuh | 2 +- cub/cub/util_compiler.cuh | 8 +- cub/cub/util_cpp_dialect.cuh | 6 +- cub/cub/util_deprecated.cuh | 2 +- .../catch2_test_device_for_each_in_extents.cu | 6 +- cub/test/catch2_test_device_transform.cu | 4 +- cub/test/test_warning_suppression.cuh | 2 +- cudax/examples/stf/fdtd_mgpu.cu | 8 +- .../cuda/experimental/__async/lazy.cuh | 2 +- .../cuda/experimental/__async/meta.cuh | 2 +- .../cuda/experimental/__async/tuple.cuh | 2 +- .../cuda/experimental/__async/variant.cuh | 2 +- .../uninitialized_async_buffer.cuh | 5 +- .../__container/uninitialized_buffer.cuh | 5 +- .../cuda/experimental/__device/device.cuh | 2 +- .../__memory_resource/any_resource.cuh | 2 +- .../__memory_resource/device_memory_pool.cuh | 4 +- .../device_memory_resource.cuh | 4 +- .../__memory_resource/shared_resource.cuh | 2 +- .../experimental/__stf/graph/graph_task.cuh | 8 +- .../__stf/internal/backend_ctx.cuh | 8 +- .../places/exec/host/callback_queues.cuh | 4 +- .../experimental/__stf/stream/stream_ctx.cuh | 4 +- .../cuda/experimental/__stf/utility/core.cuh | 2 +- .../experimental/__stf/utility/traits.cuh | 12 +- cudax/test/stf/dot/basic.cu | 4 +- cudax/test/stf/dot/graph_print_to_dot.cu | 4 +- cudax/test/stf/dot/with_events.cu | 4 +- cudax/test/stf/error_checks/ctx_mismatch.cu | 6 +- .../error_checks/data_interface_mismatch.cu | 6 +- .../test/stf/error_checks/double_finalize.cu | 6 +- cudax/test/stf/error_checks/erase_frozen.cu | 6 +- .../error_checks/misformed_tasks_dbl_end.cu | 6 +- .../error_checks/misformed_tasks_dbl_start.cu | 6 +- .../test/stf/error_checks/non_managed_data.cu | 6 +- .../stf/error_checks/slice_check_bounds.cu | 6 +- .../stf/error_checks/uninitialized_data.cu | 6 +- .../stf/error_checks/unsatisfiable_spec.cu | 6 +- cudax/test/stf/error_checks/write_frozen.cu | 6 +- cudax/test/stf/parallel_for/fdtd.cu | 8 +- cudax/test/stf/reclaiming/graph.cu | 8 +- cudax/test/stf/stress/task_bench.cu | 8 +- cudax/test/stf/tools/auto_dump/auto_dump.cu | 4 +- .../device_memory_resource.h | 4 +- .../cuda/__memory_resource/get_property.h | 4 +- .../managed_memory_resource.h | 4 +- .../pinned_memory_resource.h | 4 +- .../cuda/__memory_resource/properties.h | 4 +- .../include/cuda/__memory_resource/resource.h | 4 +- .../cuda/__memory_resource/resource_ref.h | 4 +- .../std/__algorithm/iterator_operations.h | 4 +- .../std/__algorithm/ranges_iterator_concept.h | 4 +- .../include/cuda/std/__atomic/platform.h | 2 +- .../std/__atomic/platform/msvc_to_builtins.h | 4 +- libcudacxx/include/cuda/std/__bit/clz.h | 8 +- libcudacxx/include/cuda/std/__bit/ctz.h | 8 +- libcudacxx/include/cuda/std/__bit/popc.h | 8 +- libcudacxx/include/cuda/std/__cccl/assert.h 
| 8 +-
 .../include/cuda/std/__cccl/attributes.h | 10 +-
 libcudacxx/include/cuda/std/__cccl/builtin.h | 109 ++++++++----------
 libcudacxx/include/cuda/std/__cccl/compiler.h | 26 ++---
 .../include/cuda/std/__cccl/diagnostic.h | 26 ++---
 libcudacxx/include/cuda/std/__cccl/dialect.h | 6 +-
 .../include/cuda/std/__cccl/exceptions.h | 4 +-
 libcudacxx/include/cuda/std/__cccl/rtti.h | 4 +-
 .../include/cuda/std/__cccl/system_header.h | 9 +-
 .../include/cuda/std/__cccl/unreachable.h | 8 +-
 .../include/cuda/std/__cccl/visibility.h | 14 +-
 .../cuda/std/__concepts/concept_macros.h | 6 +-
 .../cuda/std/__concepts/convertible_to.h | 8 +-
 .../cuda/std/__concepts/destructible.h | 6 +-
 .../include/cuda/std/__concepts/swappable.h | 8 +-
 libcudacxx/include/cuda/std/__fwd/get.h | 4 +-
 libcudacxx/include/cuda/std/__fwd/subrange.h | 4 +-
 .../include/cuda/std/__iterator/concepts.h | 6 +-
 .../include/cuda/std/__iterator/distance.h | 4 +-
 .../cuda/std/__iterator/iterator_traits.h | 6 +-
 .../cuda/std/__iterator/move_iterator.h | 20 ++--
 libcudacxx/include/cuda/std/__iterator/next.h | 4 +-
 libcudacxx/include/cuda/std/__iterator/prev.h | 4 +-
 .../cuda/std/__iterator/reverse_iterator.h | 6 +-
 .../std/__iterator/unreachable_sentinel.h | 8 +-
 .../cuda/std/__memory/assume_aligned.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/access.h | 4 +-
 .../include/cuda/std/__ranges/concepts.h | 4 +-
 .../include/cuda/std/__ranges/dangling.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/data.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/empty.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/rbegin.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/rend.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/size.h | 4 +-
 .../include/cuda/std/__ranges/subrange.h | 4 +-
 .../include/cuda/std/__ranges/unwrap_end.h | 4 +-
 .../cuda/std/__ranges/view_interface.h | 4 +-
 libcudacxx/include/cuda/std/__ranges/views.h | 4 +-
 .../std/__tuple_dir/structured_bindings.h | 4 +-
 .../include/cuda/std/__tuple_dir/tuple_like.h | 4 +-
 .../cuda/std/__type_traits/common_reference.h | 6 +-
 .../cuda/std/__type_traits/common_type.h | 6 +-
 .../cuda/std/__type_traits/disjunction.h | 4 +-
 .../cuda/std/__type_traits/is_convertible.h | 4 +-
 .../std/__type_traits/is_primary_template.h | 6 +-
 .../cuda/std/__type_traits/type_list.h | 6 +-
 .../include/cuda/std/__type_traits/type_set.h | 2 +-
 .../include/cuda/std/__utility/auto_cast.h | 2 +-
 .../include/cuda/std/__utility/declval.h | 3 +-
 libcudacxx/include/cuda/std/bitset | 6 +-
 .../cuda/std/detail/libcxx/include/__config | 24 ++--
 .../cuda/std/detail/libcxx/include/climits | 2 +-
 .../cuda/std/detail/libcxx/include/cmath | 18 +--
 .../cuda/std/detail/libcxx/include/limits | 2 +-
 .../cuda/std/detail/libcxx/include/span | 12 +-
 .../cuda/std/detail/libcxx/include/variant | 4 +-
 libcudacxx/include/cuda/std/inplace_vector | 52 ++++-----
 libcudacxx/include/cuda/std/version | 4 +-
 .../support.srcloc/general.pass.cpp | 8 +-
 .../bitset.members/to_ullong.pass.cpp | 2 +-
 .../bitset.members/to_ulong.pass.cpp | 2 +-
 libcudacxx/test/support/test_macros.h | 6 +-
 thrust/testing/async_sort.cu | 2 +-
 thrust/testing/cuda/transform.cu | 4 +-
 thrust/testing/functional.cu | 2 +-
 thrust/testing/set_difference.cu | 2 +-
 thrust/testing/set_intersection.cu | 2 +-
 thrust/testing/vector_manipulation.cu | 2 +-
 thrust/thrust/detail/config/compiler.h | 8 +-
 thrust/thrust/detail/config/compiler_fence.h | 4 +-
 thrust/thrust/detail/config/cpp_dialect.h | 6 +-
 thrust/thrust/detail/config/deprecated.h | 2 +-
 thrust/thrust/iterator/permutation_iterator.h | 8 +-
thrust/thrust/iterator/reverse_iterator.h | 6 +- thrust/thrust/iterator/transform_iterator.h | 8 +- thrust/thrust/iterator/zip_iterator.h | 6 +- thrust/thrust/optional.h | 4 +- thrust/thrust/system/detail/error_code.inl | 8 +- .../thrust/system/detail/error_condition.inl | 8 +- thrust/thrust/system/error_code.h | 16 +-- .../type_traits/is_contiguous_iterator.h | 2 +- 138 files changed, 458 insertions(+), 481 deletions(-) diff --git a/cub/cub/detail/nvtx.cuh b/cub/cub/detail/nvtx.cuh index 3bda5e596f3..35fae565b0b 100644 --- a/cub/cub/detail/nvtx.cuh +++ b/cub/cub/detail/nvtx.cuh @@ -96,7 +96,7 @@ CUB_NAMESPACE_END # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) CUB_DETAIL_NVTX_RANGE_SCOPE_IF(true, name) # else // NVTX3_CPP_DEFINITIONS_V1_0 -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # pragma message( \ "warning: nvtx3.hpp is available but does not define the V1 API. This is odd. Please open a GitHub issue at: https://github.com/NVIDIA/cccl/issues.") # else diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh index 6385e795045..2110268617c 100644 --- a/cub/cub/util_compiler.cuh +++ b/cub/cub/util_compiler.cuh @@ -66,13 +66,13 @@ #define CUB_DEVICE_COMPILER_CLANG 4 // figure out which host compiler we're using -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC //! deprecated [Since 2.7] -# define CUB_MSVC_VERSION _CCCL_MSVC_VERSION +# define CUB_MSVC_VERSION _MSC_VER //! deprecated [Since 2.7] -# define CUB_MSVC_VERSION_FULL _CCCL_MSVC_VERSION_FULL +# define CUB_MSVC_VERSION_FULL _MSC_FULL_VER #elif _CCCL_COMPILER(CLANG) //! deprecated [Since 2.7] # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG @@ -89,7 +89,7 @@ #if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_CUDA_COMPILER_NVHPC) //! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC #elif _CCCL_COMPILER(GCC) diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 6f54239bf84..e2affdb3304 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -80,7 +80,7 @@ # define CUB_CPP_DIALECT _CCCL_STD_VER // Define CUB_COMPILER_DEPRECATION macro: -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define CUB_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(message(__FILE__ ":" _CCCL_TO_STRING(__LINE__) ": warning: " #msg)) # else // clang / gcc: # define CUB_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(GCC warning #msg) @@ -101,10 +101,10 @@ CUB_COMPILER_DEPRECATION(GCC 5.0); # elif _CCCL_COMPILER(CLANG, <, 7) CUB_COMPILER_DEPRECATION(Clang 7.0); -# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1910 +# elif _CCCL_COMPILER(MSVC, <, 19, 10) // <2017. Hard upgrade message: CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); -# elif defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1920 +# elif _CCCL_COMPILER(MSVC2017) // >=2017, <2019. 
Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif diff --git a/cub/cub/util_deprecated.cuh b/cub/cub/util_deprecated.cuh index 250c3f53b16..c227d4309b9 100644 --- a/cub/cub/util_deprecated.cuh +++ b/cub/cub/util_deprecated.cuh @@ -55,7 +55,7 @@ #elif _CCCL_STD_VER >= 2014 # define CUB_DEPRECATED [[deprecated]] # define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define CUB_DEPRECATED __declspec(deprecated) # define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) #elif _CCCL_COMPILER(CLANG) diff --git a/cub/test/catch2_test_device_for_each_in_extents.cu b/cub/test/catch2_test_device_for_each_in_extents.cu index 51f9b7e6d37..6f11810101c 100644 --- a/cub/test/catch2_test_device_for_each_in_extents.cu +++ b/cub/test/catch2_test_device_for_each_in_extents.cu @@ -26,9 +26,9 @@ ******************************************************************************/ #include -// TODO: remove _CCCL_COMPILER_MSVC check after MSVC bug related to vector comparison is fixed: +// TODO: remove _CCCL_COMPILER(MSVC) check after MSVC bug related to vector comparison is fixed: // "error C3546: '...': there are no parameter packs available to expand" -#if __cccl_lib_mdspan && !defined(_CCCL_COMPILER_MSVC) +#if __cccl_lib_mdspan && !_CCCL_COMPILER(MSVC) # include @@ -181,4 +181,4 @@ C2H_TEST("DeviceForEachInExtents 3D dynamic", "[ForEachInExtents][dynamic][devic REQUIRE(h_output == h_output_gpu); } -#endif // __cccl_lib_mdspan && !defined(_CCCL_COMPILER_MSVC) +#endif // __cccl_lib_mdspan && !_CCCL_COMPILER(MSVC) diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu index 4da07e330b6..db05da6c032 100644 --- a/cub/test/catch2_test_device_transform.cu +++ b/cub/test/catch2_test_device_transform.cu @@ -178,10 +178,10 @@ struct alignas(Alignment) overaligned_addable_t using overaligned_types = c2h::type_list -#ifndef _CCCL_COMPILER_MSVC // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned +#if !_CCCL_COMPILER(MSVC) // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned , overaligned_addable_t<256> -#endif // _CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) >; // test with types exceeding the memcpy_async and bulk copy alignments (16 and 128 bytes respectively) diff --git a/cub/test/test_warning_suppression.cuh b/cub/test/test_warning_suppression.cuh index e11d199e0a8..46c6080fed7 100644 --- a/cub/test/test_warning_suppression.cuh +++ b/cub/test/test_warning_suppression.cuh @@ -33,7 +33,7 @@ // C4127: conditional expression is constant // This can be fixed with `if constexpr` when available, but there's no way to // silence these pre-C++17. 
-#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # if _CCCL_STD_VER < 2017 # pragma warning(disable : 4127) # endif diff --git a/cudax/examples/stf/fdtd_mgpu.cu b/cudax/examples/stf/fdtd_mgpu.cu index a9a54d1f993..d991c97f258 100644 --- a/cudax/examples/stf/fdtd_mgpu.cu +++ b/cudax/examples/stf/fdtd_mgpu.cu @@ -21,7 +21,7 @@ using namespace cuda::experimental::stf; // FIXME : MSVC has trouble with box constructors -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) { FILE* f = fopen(filename.c_str(), "w"); @@ -99,11 +99,11 @@ _CCCL_DEVICE double Source(double t, double x, double y, double z) constexpr double k = 2 * pi / wavelength; return sin(k * x - omega * t); } -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) context ctx; // Initialize the time loop @@ -292,5 +292,5 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) }; ctx.finalize(); -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/include/cuda/experimental/__async/lazy.cuh b/cudax/include/cuda/experimental/__async/lazy.cuh index 95f7a4a0adb..7655b658401 100644 --- a/cudax/include/cuda/experimental/__async/lazy.cuh +++ b/cudax/include/cuda/experimental/__async/lazy.cuh @@ -136,7 +136,7 @@ struct __lazy_tupl<_CUDA_VSTD::index_sequence<_Idx...>, _Ts...> : __detail::__la bool __engaged_[sizeof...(_Ts)] = {}; }; -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __mk_lazy_tuple_ { diff --git a/cudax/include/cuda/experimental/__async/meta.cuh b/cudax/include/cuda/experimental/__async/meta.cuh index dbe2d21a15a..4f2b00a9de2 100644 --- a/cudax/include/cuda/experimental/__async/meta.cuh +++ b/cudax/include/cuda/experimental/__async/meta.cuh @@ -135,7 +135,7 @@ inline constexpr bool __type_is_error<_ERROR<_What...>&> = true; // True if any of the types in _Ts... are errors; false otherwise. 
template inline constexpr bool __type_contains_error = -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) (__type_is_error<_Ts> || ...); #else __ustdex_unhandled_error(static_cast<_CUDA_VSTD::__type_list<_Ts...>*>(nullptr)); diff --git a/cudax/include/cuda/experimental/__async/tuple.cuh b/cudax/include/cuda/experimental/__async/tuple.cuh index 3891ec47df7..06e74e3aabc 100644 --- a/cudax/include/cuda/experimental/__async/tuple.cuh +++ b/cudax/include/cuda/experimental/__async/tuple.cuh @@ -82,7 +82,7 @@ template using __apply_result_t = decltype(__declval<_Tupl>().__apply(__declval<_Fn>(), __declval<_Tupl>(), __declval<_Us>()...)); -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __mk_tuple_ { diff --git a/cudax/include/cuda/experimental/__async/variant.cuh b/cudax/include/cuda/experimental/__async/variant.cuh index 3398cdc9717..2c8c5b1ea16 100644 --- a/cudax/include/cuda/experimental/__async/variant.cuh +++ b/cudax/include/cuda/experimental/__async/variant.cuh @@ -169,7 +169,7 @@ public: } }; -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __mk_variant_ { diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 5bfd60da9d3..fb502cbbf7d 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -34,8 +34,7 @@ #include -#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ - && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) //! @file //! The \c uninitialized_async_buffer class provides a typed buffer allocated in stream-order from a given memory @@ -299,6 +298,6 @@ using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_ } // namespace cuda::experimental -#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //__CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 38c968d25c8..9a2f1200678 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -33,8 +33,7 @@ #include -#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ - && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) //! @file //! The \c uninitialized_buffer class provides a typed buffer allocated from a given memory resource. 
@@ -264,6 +263,6 @@ using uninitialized_device_buffer = uninitialized_buffer<_Tp, _CUDA_VMR::device_ } // namespace cuda::experimental -#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //__CUDAX__CONTAINERS_UNINITIALIZED_BUFFER_H diff --git a/cudax/include/cuda/experimental/__device/device.cuh b/cudax/include/cuda/experimental/__device/device.cuh index 3e19bafb4e7..98db56f668e 100644 --- a/cudax/include/cuda/experimental/__device/device.cuh +++ b/cudax/include/cuda/experimental/__device/device.cuh @@ -69,7 +69,7 @@ public: using attr_result_t = typename detail::__dev_attr<_Attr>::type; #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) // When __EDG__ is defined, std::construct_at will not permit constructing // a device object from an __emplace_device object. This is a workaround. device(detail::__emplace_device __ed) diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index f386853bb08..f442e56dcfe 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -28,7 +28,7 @@ #endif // cuda::mr is unavable on MSVC 2017 -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) # error "The any_resource header is not supported on MSVC 2017" #endif diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh index 4708930d8ad..c74f7d68f77 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh @@ -22,7 +22,7 @@ #endif // no system header // cudaMallocAsync was introduced in CTK 11.2 -#if !defined(_CCCL_COMPILER_MSVC_2017) && _CCCL_CUDACC_AT_LEAST(11, 2) +#if !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -428,6 +428,6 @@ public: # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && _CCCL_CUDACC_AT_LEAST(11, 2) +#endif // !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) #endif // _CUDAX__MEMORY_RESOURCE_DEVICE_MEMORY_POOL diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh index 7d54dd4f750..fffe3dea722 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh @@ -22,7 +22,7 @@ #endif // no system header // cudaMallocAsync was introduced in CTK 11.2 -#if !defined(_CCCL_COMPILER_MSVC_2017) && _CCCL_CUDACC_AT_LEAST(11, 2) +#if !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -417,6 +417,6 @@ static_assert(_CUDA_VMR::resource_with= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && _CCCL_CUDACC_AT_LEAST(11, 2) +#endif // !_CCCL_COMPILER(MSVC2017) && _CCCL_CUDACC_AT_LEAST(11, 2) #endif //_CUDAX__MEMORY_RESOURCE_CUDA_DEVICE_MEMORY_RESOURCE diff --git a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh 
b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh index e92538ae8a0..1b0a81320b1 100644 --- a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh @@ -28,7 +28,7 @@ #endif // cuda::mr is unavable on MSVC 2017 -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) # error "The shared_resource header is not supported on MSVC 2017" #endif diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh index 884abc7cdac..f10c883e2ee 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_task.cuh @@ -420,11 +420,11 @@ public: return mv(*this); } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) // TODO (miscco): figure out why MSVC is complaining about unreachable code here _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) template void operator->*(Fun&& f) @@ -518,9 +518,9 @@ public: ::std::apply(f, tuple_prepend(mv(childGraph), typed_deps())); } } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) private: auto typed_deps() diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index ce162fc40c0..2822370c1f3 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -492,19 +492,19 @@ protected: return nullptr; } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) virtual event_list stream_to_event_list(cudaStream_t, ::std::string) const { fprintf(stderr, "Internal error.\n"); abort(); return event_list(); } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) virtual size_t epoch() const { diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh index 2d3036ec143..387a3594c1f 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -32,7 +32,7 @@ #ifndef _CCCL_DOXYGEN_INVOKED // do not document -# if !defined(_CCCL_COMPILER_MSVC) +# if !_CCCL_COMPILER(MSVC) # define STATEFUL_CALLBACKS namespace cuda::experimental::stf @@ -602,5 +602,5 @@ inline bool cudaCallbackQueueProgress(callback_queue* q, bool flag) } // end namespace cuda::experimental::stf -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #endif // _CCCL_DOXYGEN_INVOKED do not document diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh index 022179341d9..86cceb1b2d9 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -809,7 +809,7 @@ UNITTEST("movable stream_task") // FIXME : This test is causing some compiler errors with MSVC, so we disable // it on MSVC for now -# if !defined(_CCCL_COMPILER_MSVC) 
+# if !_CCCL_COMPILER(MSVC) UNITTEST("logical_data_untyped moveable") { using namespace cuda::experimental::stf; @@ -852,7 +852,7 @@ UNITTEST("logical_data_untyped moveable") ctx.finalize(); }; -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) # ifdef __CUDACC__ namespace reserved diff --git a/cudax/include/cuda/experimental/__stf/utility/core.cuh b/cudax/include/cuda/experimental/__stf/utility/core.cuh index 23b0ff5560f..42e68c36905 100644 --- a/cudax/include/cuda/experimental/__stf/utility/core.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/core.cuh @@ -39,7 +39,7 @@ namespace cuda::experimental::stf { // Hack setenv on Windows -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) /** * @brief Sets an environment variable, mimicking the behavior of `std::setenv` on Windows. * diff --git a/cudax/include/cuda/experimental/__stf/utility/traits.cuh b/cudax/include/cuda/experimental/__stf/utility/traits.cuh index 402737a44d3..8308e56d702 100644 --- a/cudax/include/cuda/experimental/__stf/utility/traits.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/traits.cuh @@ -44,11 +44,11 @@ namespace reserved template constexpr ::std::string_view type_name_IMPL() { -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) return __FUNCSIG__; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv return __PRETTY_FUNCTION__; -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } // Length of prefix and suffix in __PRETTY_FUNCTION__ when used with `type_name`. @@ -73,14 +73,14 @@ inline constexpr ::std::pair type_name_affixes = [] { template constexpr ::std::string_view type_name_impl() { -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) constexpr ::std::string_view p = __FUNCSIG__; // MSVC does not provide constexpr methods so we make this utility much simpler and return __FUNCSIG__ directly return p; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv ::std::string_view p = __PRETTY_FUNCTION__; return p.substr(type_name_affixes.first, p.size() - type_name_affixes.first - type_name_affixes.second); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } } // namespace reserved diff --git a/cudax/test/stf/dot/basic.cu b/cudax/test/stf/dot/basic.cu index dce79545a71..b602f142a2f 100644 --- a/cudax/test/stf/dot/basic.cu +++ b/cudax/test/stf/dot/basic.cu @@ -20,7 +20,7 @@ using namespace cuda::experimental::stf; int main() { // TODO (miscco): Make it work for windows -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random filename int r = rand(); @@ -44,5 +44,5 @@ int main() EXPECT(access(filename, F_OK) != -1); EXPECT(unlink(filename) == 0); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/dot/graph_print_to_dot.cu b/cudax/test/stf/dot/graph_print_to_dot.cu index 62e5abfd193..cd024f72770 100644 --- a/cudax/test/stf/dot/graph_print_to_dot.cu +++ b/cudax/test/stf/dot/graph_print_to_dot.cu @@ -22,7 +22,7 @@ __global__ void dummy() {} int main() { // TODO (miscco): Make it work for windows -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random filename int r = rand(); @@ -46,5 +46,5 @@ int main() EXPECT(access(filename, F_OK) != -1); EXPECT(unlink(filename) == 0); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git 
a/cudax/test/stf/dot/with_events.cu b/cudax/test/stf/dot/with_events.cu index a03425024b8..b3c636e1d69 100644 --- a/cudax/test/stf/dot/with_events.cu +++ b/cudax/test/stf/dot/with_events.cu @@ -20,7 +20,7 @@ using namespace cuda::experimental::stf; int main() { // TODO (miscco): Make it work for windows -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random filename int r = rand(); @@ -45,5 +45,5 @@ int main() EXPECT(access(filename, F_OK) != -1); EXPECT(unlink(filename) == 0); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/error_checks/ctx_mismatch.cu b/cudax/test/stf/error_checks/ctx_mismatch.cu index cafa6873dec..c04d589c367 100644 --- a/cudax/test/stf/error_checks/ctx_mismatch.cu +++ b/cudax/test/stf/error_checks/ctx_mismatch.cu @@ -53,9 +53,9 @@ void run(double (&X)[n]) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -66,7 +66,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) const int n = 12; double X[n]; diff --git a/cudax/test/stf/error_checks/data_interface_mismatch.cu b/cudax/test/stf/error_checks/data_interface_mismatch.cu index 79969f390ba..ea2ada7e633 100644 --- a/cudax/test/stf/error_checks/data_interface_mismatch.cu +++ b/cudax/test/stf/error_checks/data_interface_mismatch.cu @@ -63,9 +63,9 @@ void run(double (&X)[n]) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -76,7 +76,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) const int n = 12; double X[n]; diff --git a/cudax/test/stf/error_checks/double_finalize.cu b/cudax/test/stf/error_checks/double_finalize.cu index 6de61c0c2b3..37913ca6e36 100644 --- a/cudax/test/stf/error_checks/double_finalize.cu +++ b/cudax/test/stf/error_checks/double_finalize.cu @@ -39,9 +39,9 @@ int main() // This test only works when assert() is enabled in #ifndef NDEBUG /* Setup an handler to catch the SIGABRT signal during the programming error */ -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) context ctx; diff --git a/cudax/test/stf/error_checks/erase_frozen.cu b/cudax/test/stf/error_checks/erase_frozen.cu index 3e99c360aa2..624dfb062f8 100644 --- a/cudax/test/stf/error_checks/erase_frozen.cu +++ b/cudax/test/stf/error_checks/erase_frozen.cu @@ -40,9 +40,9 @@ void cleanupRoutine(int 
/*unused*/) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -53,7 +53,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; const int N = 16; diff --git a/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu b/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu index b91a8d0aabb..fa28e5467e0 100644 --- a/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu +++ b/cudax/test/stf/error_checks/misformed_tasks_dbl_end.cu @@ -39,9 +39,9 @@ int main() // This test only works when assert() is enabled in #ifndef NDEBUG /* Setup an handler to catch the SIGABRT signal during the programming error */ -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu b/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu index 3f783773b0e..b35cb99457f 100644 --- a/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu +++ b/cudax/test/stf/error_checks/misformed_tasks_dbl_start.cu @@ -37,9 +37,9 @@ void cleanupRoutine(int /*unused*/) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -50,7 +50,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/non_managed_data.cu b/cudax/test/stf/error_checks/non_managed_data.cu index 387322a0912..a1188c7750f 100644 --- a/cudax/test/stf/error_checks/non_managed_data.cu +++ b/cudax/test/stf/error_checks/non_managed_data.cu @@ -41,9 +41,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -54,7 +54,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/slice_check_bounds.cu b/cudax/test/stf/error_checks/slice_check_bounds.cu index f27cebdd722..fecea9e7a55 100644 --- a/cudax/test/stf/error_checks/slice_check_bounds.cu +++ 
b/cudax/test/stf/error_checks/slice_check_bounds.cu @@ -48,9 +48,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -61,7 +61,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) context ctx; diff --git a/cudax/test/stf/error_checks/uninitialized_data.cu b/cudax/test/stf/error_checks/uninitialized_data.cu index efd45db4d1b..6af57556ad5 100644 --- a/cudax/test/stf/error_checks/uninitialized_data.cu +++ b/cudax/test/stf/error_checks/uninitialized_data.cu @@ -39,9 +39,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; diff --git a/cudax/test/stf/error_checks/unsatisfiable_spec.cu b/cudax/test/stf/error_checks/unsatisfiable_spec.cu index ee3c10ad9cf..a0e4277979c 100644 --- a/cudax/test/stf/error_checks/unsatisfiable_spec.cu +++ b/cudax/test/stf/error_checks/unsatisfiable_spec.cu @@ -39,9 +39,9 @@ int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ #ifndef NDEBUG -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -52,7 +52,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) context ctx; diff --git a/cudax/test/stf/error_checks/write_frozen.cu b/cudax/test/stf/error_checks/write_frozen.cu index 1d46c702c0f..b4e08642a5e 100644 --- a/cudax/test/stf/error_checks/write_frozen.cu +++ b/cudax/test/stf/error_checks/write_frozen.cu @@ -40,9 +40,9 @@ void cleanupRoutine(int /*unused*/) int main() { /* Setup an handler to catch the SIGABRT signal during the programming error */ -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) signal(SIGABRT, &cleanupRoutine); -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) struct sigaction sigabrt_action {}; memset(&sigabrt_action, 0, sizeof(sigabrt_action)); @@ -53,7 +53,7 @@ int main() perror("sigaction SIGABRT"); exit(EXIT_FAILURE); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) stream_ctx ctx; const int N = 16; diff --git a/cudax/test/stf/parallel_for/fdtd.cu b/cudax/test/stf/parallel_for/fdtd.cu index 0e97d2e7afd..686d613d710 100644 --- a/cudax/test/stf/parallel_for/fdtd.cu +++ b/cudax/test/stf/parallel_for/fdtd.cu @@ -14,7 +14,7 @@ using namespace cuda::experimental::stf; // FIXME 
: MSVC has trouble with box constructors -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) { FILE* f = fopen(filename.c_str(), "w"); @@ -92,11 +92,11 @@ __device__ double Source(double t, double x, double y, double z) constexpr double k = 2 * pi / wavelength; return sin(k * x - omega * t); } -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) stream_ctx ctx; // Domain dimensions @@ -250,5 +250,5 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) } ctx.finalize(); -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/reclaiming/graph.cu b/cudax/test/stf/reclaiming/graph.cu index 3c77cb15980..56e82254bff 100644 --- a/cudax/test/stf/reclaiming/graph.cu +++ b/cudax/test/stf/reclaiming/graph.cu @@ -10,19 +10,19 @@ #include -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) using namespace cuda::experimental::stf; __global__ void kernel() { // No-op } -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { // TODO fix setenv -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) int nblocks = 4; size_t block_size = 1024 * 1024; @@ -68,5 +68,5 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) } ctx.finalize(); -#endif // !defined(_CCCL_COMPILER_MSVC) +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/cudax/test/stf/stress/task_bench.cu b/cudax/test/stf/stress/task_bench.cu index 769b057075b..5d5d483c9ca 100644 --- a/cudax/test/stf/stress/task_bench.cu +++ b/cudax/test/stf/stress/task_bench.cu @@ -57,10 +57,10 @@ int log2Int(int n) return result; } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) bool skip_task(test_id id, int t, int i, int /*W*/) { switch (id) @@ -89,9 +89,9 @@ bool skip_task(test_id id, int t, int i, int /*W*/) abort(); return true; } -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) std::vector input_deps(test_id id, int t, int i, int W) { diff --git a/cudax/test/stf/tools/auto_dump/auto_dump.cu b/cudax/test/stf/tools/auto_dump/auto_dump.cu index 835016fcf2a..dd9b06981d4 100644 --- a/cudax/test/stf/tools/auto_dump/auto_dump.cu +++ b/cudax/test/stf/tools/auto_dump/auto_dump.cu @@ -21,7 +21,7 @@ using namespace cuda::experimental::stf; int main() { -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) // Generate a random dirname srand(static_cast(time(nullptr))); int r = rand(); @@ -60,5 +60,5 @@ int main() EXPECT(!std::filesystem::exists(dirname + "/" + std::to_string(2))); std::filesystem::remove_all(dirname); -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) } diff --git a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h index d82ba355ff4..72e01a5521d 100644 --- a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if 
!defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -214,6 +214,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif // _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/get_property.h b/libcudacxx/include/cuda/__memory_resource/get_property.h index fcfa023e6fa..d9b762225eb 100644 --- a/libcudacxx/include/cuda/__memory_resource/get_property.h +++ b/libcudacxx/include/cuda/__memory_resource/get_property.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # include # include @@ -180,6 +180,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_GET_PROPERTY_H diff --git a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h index c1af2074beb..86835aede18 100644 --- a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -196,6 +196,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h index 2fe29653d75..819d485a104 100644 --- a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h +++ b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if defined(_CCCL_CUDA_COMPILER_CLANG) # include @@ -199,6 +199,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/properties.h b/libcudacxx/include/cuda/__memory_resource/properties.h index 42fbbda5f7b..6b0279eb06f 100644 --- 
a/libcudacxx/include/cuda/__memory_resource/properties.h +++ b/libcudacxx/include/cuda/__memory_resource/properties.h @@ -24,7 +24,7 @@ #include #include -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # if _CCCL_STD_VER >= 2014 @@ -68,6 +68,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_PROPERTIES_H diff --git a/libcudacxx/include/cuda/__memory_resource/resource.h b/libcudacxx/include/cuda/__memory_resource/resource.h index bfcf6d73174..0b864e649d6 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource.h +++ b/libcudacxx/include/cuda/__memory_resource/resource.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # include # include @@ -129,6 +129,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/resource_ref.h b/libcudacxx/include/cuda/__memory_resource/resource_ref.h index 164625c6493..81831720349 100644 --- a/libcudacxx/include/cuda/__memory_resource/resource_ref.h +++ b/libcudacxx/include/cuda/__memory_resource/resource_ref.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if !defined(_CCCL_COMPILER_MSVC_2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # include # include @@ -640,6 +640,6 @@ _LIBCUDACXX_END_NAMESPACE_CUDA_MR # endif // _CCCL_STD_VER >= 2014 -#endif // !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE #endif //_CUDA__MEMORY_RESOURCE_RESOURCE_REF_H diff --git a/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h b/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h index 7eb7a715962..be02de72b97 100644 --- a/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h +++ b/libcudacxx/include/cuda/std/__algorithm/iterator_operations.h @@ -44,7 +44,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template struct _IterOps; -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) struct _RangeAlgPolicy {}; @@ -69,7 +69,7 @@ struct _IterOps<_RangeAlgPolicy> static constexpr auto __advance_to = _CUDA_VRANGES::advance; }; -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) struct _ClassicAlgPolicy {}; diff --git a/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h b/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h index 7562eedbf95..43e674b976f 100644 --- a/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h +++ b/libcudacxx/include/cuda/std/__algorithm/ranges_iterator_concept.h @@ -24,7 
+24,7 @@ #include #include -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -60,6 +60,6 @@ using __iterator_concept = decltype(__get_iterator_concept<_Iter>()); _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ALGORITHM_RANGES_ITERATOR_CONCEPT_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform.h b/libcudacxx/include/cuda/std/__atomic/platform.h index 6367e20234e..66eae16ffaa 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform.h +++ b/libcudacxx/include/cuda/std/__atomic/platform.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include #endif diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index eb5721c8022..8c532260284 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include # include @@ -637,6 +637,6 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) _LIBCUDACXX_END_NAMESPACE_STD -#endif // defined(_CCCL_COMPILER_MSVC) +#endif // _CCCL_COMPILER(MSVC) #endif // __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h index c61508f8287..267f022737a 100644 --- a/libcudacxx/include/cuda/std/__bit/clz.h +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -24,9 +24,9 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -55,7 +55,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_clz64(uint64_t __x) return __binary_clz32(__x >> 32 * !!(__x & 0xFFFFFFFF00000000), 32 * !(__x & 0xFFFFFFFF00000000)); } -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_clz(uint32_t __x) noexcept { @@ -97,7 +97,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) noexcept return __constexpr_clz(__x); } -#else // defined(_CCCL_COMPILER_MSVC) +#else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h index 0f08f67d38b..9d2e771bd61 100644 --- a/libcudacxx/include/cuda/std/__bit/ctz.h +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -24,9 +24,9 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -55,7 +55,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __binary_ctz64(uint64_t __x) noexcept return __binary_ctz32(__x >> 32 * !(__x & 0x00000000FFFFFFFF), 32 * !(__x & 0x00000000FFFFFFFF)); } -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_ctz(uint32_t __x) noexcept { @@ -99,7 +99,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) noexcept return __constexpr_ctz(__x); } -#else 
// defined(_CCCL_COMPILER_MSVC) +#else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) diff --git a/libcudacxx/include/cuda/std/__bit/popc.h b/libcudacxx/include/cuda/std/__bit/popc.h index d3ec52342ad..dc22999b985 100644 --- a/libcudacxx/include/cuda/std/__bit/popc.h +++ b/libcudacxx/include/cuda/std/__bit/popc.h @@ -24,7 +24,7 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include # if defined(_M_ARM64) @@ -35,7 +35,7 @@ # define _LIBCUDACXX_MSVC_POPC64(x) __popcnt64(x) # endif // !_M_ARM64 -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -56,7 +56,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __fallback_popc64(uint64_t __x) return __fallback_popc32(__x - ((__x >> 1) & 0x5555555555555555)); } -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_popcount(uint32_t __x) noexcept { @@ -98,7 +98,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) noexcept return __constexpr_popcount(static_cast(__x)); } -#else // defined(_CCCL_COMPILER_MSVC) +#else // _CCCL_COMPILER(MSVC) _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) { diff --git a/libcudacxx/include/cuda/std/__cccl/assert.h b/libcudacxx/include/cuda/std/__cccl/assert.h index 70b2398cccf..8c48296619c 100644 --- a/libcudacxx/include/cuda/std/__cccl/assert.h +++ b/libcudacxx/include/cuda/std/__cccl/assert.h @@ -64,7 +64,7 @@ //! _CCCL_ASSERT_IMPL_HOST should never be used directly #if _CCCL_COMPILER(NVRTC) // There is no host standard library in nvrtc # define _CCCL_ASSERT_IMPL_HOST(expression, message) ((void) 0) -#elif _CCCL_HAS_INCLUDE() && defined(_CCCL_COMPILER_MSVC) // MSVC uses _STL_VERIFY from +#elif _CCCL_HAS_INCLUDE() && _CCCL_COMPILER(MSVC) // MSVC uses _STL_VERIFY from # include # define _CCCL_ASSERT_IMPL_HOST(expression, message) _STL_VERIFY(expression, message) #else // ^^^ MSVC STL ^^^ / vvv !MSVC STL vvv @@ -97,15 +97,15 @@ _CCCL_HOST_DEVICE _CCCL_BUILTIN_EXPECT(static_cast(expression), 1) \ ? (void) 0 : __assertfail(message, __FILE__, __LINE__, __func__, sizeof(char)) #elif defined(_CCCL_CUDA_COMPILER_NVCC) //! Use __assert_fail to implement device side asserts -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define _CCCL_ASSERT_IMPL_DEVICE(expression, message) \ _CCCL_BUILTIN_EXPECT(static_cast(expression), 1) \ ? (void) 0 : _wassert(_CRT_WIDE(#message), __FILEW__, __LINE__) -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_ASSERT_IMPL_DEVICE(expression, message) \ _CCCL_BUILTIN_EXPECT(static_cast(expression), 1) \ ? 
(void) 0 : __assert_fail(message, __FILE__, __LINE__, __func__) -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #elif defined(_CCCL_CUDA_COMPILER) # define _CCCL_ASSERT_IMPL_DEVICE(expression, message) _CCCL_ASSERT_IMPL_HOST(expression, message) #else // ^^^ _CCCL_CUDA_COMPILER ^^^ / vvv !_CCCL_CUDA_COMPILER vvv diff --git a/libcudacxx/include/cuda/std/__cccl/attributes.h b/libcudacxx/include/cuda/std/__cccl/attributes.h index 7a8bcb49f0b..0ed5fdd2b7f 100644 --- a/libcudacxx/include/cuda/std/__cccl/attributes.h +++ b/libcudacxx/include/cuda/std/__cccl/attributes.h @@ -80,7 +80,7 @@ # define _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS #endif // !_CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS && _CCCL_COMPILER(CLANG) -#if _CCCL_HAS_CPP_ATTRIBUTE(nodiscard) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_STD_VER >= 2017) +#if _CCCL_HAS_CPP_ATTRIBUTE(nodiscard) || (_CCCL_COMPILER(MSVC) && _CCCL_STD_VER >= 2017) # define _CCCL_NODISCARD [[nodiscard]] #else // ^^^ has nodiscard ^^^ / vvv no nodiscard vvv # define _CCCL_NODISCARD @@ -101,7 +101,7 @@ # define _CCCL_ALIAS_ATTRIBUTE(...) __VA_ARGS__ #endif // _CCCL_CUDACC_AT_LEAST(11, 3) -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_NORETURN __declspec(noreturn) #elif _CCCL_HAS_CPP_ATTRIBUTE(noreturn) # define _CCCL_NORETURN [[noreturn]] @@ -109,10 +109,10 @@ # define _CCCL_NORETURN __attribute__((noreturn)) #endif -#if defined(_CCCL_COMPILER_MSVC) // vvv _CCCL_COMPILER_MSVC vvv +#if _CCCL_COMPILER(MSVC) // vvv _CCCL_COMPILER(MSVC) vvv # define _CCCL_RESTRICT __restrict -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_RESTRICT __restrict__ -#endif // ^^^ !_CCCL_COMPILER_MSVC ^^^ +#endif // ^^^ !_CCCL_COMPILER(MSVC) ^^^ #endif // __CCCL_ATTRIBUTES_H diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index ac93b0f8caf..b3a53918054 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -86,8 +86,7 @@ # define _CCCL_BUILTIN_ARRAY_EXTENT(...) __array_extent(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__array_extent) -#if _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1923) \ - || _CCCL_COMPILER(GCC) +#if _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || _CCCL_COMPILER(MSVC, >=, 19, 23) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned) @@ -97,7 +96,7 @@ #endif // _CCCL_CUDACC_BELOW(11, 2) // nvhpc has a bug where it supports __builtin_addressof but does not mark it via _CCCL_CHECK_BUILTIN -#if _CCCL_CHECK_BUILTIN(builtin_addressof) || _CCCL_COMPILER(GCC, >=, 7) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(builtin_addressof) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVHPC) # define _CCCL_BUILTIN_ADDRESSOF(...) __builtin_addressof(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_addressof) @@ -117,7 +116,7 @@ #endif // _CCCL_CUDACC_BELOW(11, 2) // MSVC supports __builtin_bit_cast from 19.25 on -#if _CCCL_CHECK_BUILTIN(builtin_bit_cast) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION > 1925) +#if _CCCL_CHECK_BUILTIN(builtin_bit_cast) || _CCCL_COMPILER(MSVC, >, 19, 25) # define _CCCL_BUILTIN_BIT_CAST(...) 
__builtin_bit_cast(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_bit_cast) @@ -127,7 +126,7 @@ # undef _CCCL_BUILTIN_BIT_CAST #endif // clang < 10 || nvcc < 11.7 -#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927 +#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_COLUMN() __builtin_COLUMN() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_COLUMN) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_COLUMN) vvv # define _CCCL_BUILTIN_COLUMN() 0 @@ -143,12 +142,11 @@ # define _CCCL_BUILTIN_CONSTANT_P(...) __builtin_constant_p(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_contant_p) -#if _CCCL_CHECK_BUILTIN(builtin_expect) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(GCC) +#if _CCCL_CHECK_BUILTIN(builtin_expect) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_EXPECT(...) __builtin_expect(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_expect) -#if _CCCL_HAS_BUILTIN(__builtin_FILE) || _CCCL_COMPILER(GCC) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927) +#if _CCCL_HAS_BUILTIN(__builtin_FILE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FILE() __builtin_FILE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FILE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FILE) vvv # define _CCCL_BUILTIN_FILE() __FILE__ @@ -160,8 +158,7 @@ # define _CCCL_BUILTIN_FILE() __FILE__ #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927) +#if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FUNCTION() __builtin_FUNCTION() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FUNCTION) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FUNCTION) vvv # define _CCCL_BUILTIN_FUNCTION() "__builtin_FUNCTION is unsupported" @@ -174,7 +171,7 @@ #endif // _CCCL_CUDACC_BELOW(11, 3) #if _CCCL_CHECK_BUILTIN(builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) + || (_CCCL_COMPILER(MSVC, >, 19, 24) && _CCCL_CUDACC_AT_LEAST(11, 3)) # define _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(...) __builtin_is_constant_evaluated(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_is_constant_evaluated) @@ -193,8 +190,7 @@ # undef _CCCL_BUILTIN_LAUNDER #endif // clang < 10 || nvcc < 11.3 -#if _CCCL_HAS_BUILTIN(__builtin_LINE) || _CCCL_COMPILER(GCC) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927) +#if _CCCL_HAS_BUILTIN(__builtin_LINE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_LINE() __builtin_LINE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_LINE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_LINE) vvv # define _CCCL_BUILTIN_LINE() __LINE__ @@ -216,27 +212,27 @@ # define _CCCL_BUILTIN_DECAY(...) __decay(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__decay) && clang-cuda -#if _CCCL_CHECK_BUILTIN(has_nothrow_assign) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_nothrow_assign) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_NOTHROW_ASSIGN(...) 
__has_nothrow_assign(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_nothrow_assign) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_nothrow_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_nothrow_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_NOTHROW_CONSTRUCTOR(...) __has_nothrow_constructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_nothrow_constructor) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_nothrow_copy) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_nothrow_copy) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_NOTHROW_COPY(...) __has_nothrow_copy(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_nothrow_copy) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_trivial_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_trivial_constructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_TRIVIAL_CONSTRUCTOR(...) __has_trivial_constructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_trivial_constructor) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(has_trivial_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_trivial_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_TRIVIAL_DESTRUCTOR(...) __has_trivial_destructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_trivial_destructor) && gcc >= 4.3 @@ -245,7 +241,7 @@ # define _CCCL_BUILTIN_HAS_UNIQUE_OBJECT_REPRESENTATIONS(...) __has_unique_object_representations(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_unique_object_representations) && gcc >= 7.0 -#if _CCCL_CHECK_BUILTIN(has_virtual_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(has_virtual_destructor) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_HAS_VIRTUAL_DESTRUCTOR(...) __has_virtual_destructor(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(has_virtual_destructor) && gcc >= 4.3 @@ -254,8 +250,8 @@ # define _CCCL_BUILTIN_INTEGER_PACK(...) __integer_pack(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__integer_pack) -#if _CCCL_CHECK_BUILTIN(is_aggregate) || _CCCL_COMPILER(GCC, >=, 7) \ - || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION > 1914) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_aggregate) || _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(MSVC, >, 19, 14) \ + || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_AGGREGATE(...) __is_aggregate(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_aggregate) && gcc >= 7.0 @@ -268,17 +264,15 @@ # undef _CCCL_BUILTIN_IS_ARRAY #endif // clang < 19 -#if _CCCL_CHECK_BUILTIN(is_assignable) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(GCC, >=, 9) +#if _CCCL_CHECK_BUILTIN(is_assignable) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(GCC, >=, 9) # define _CCCL_BUILTIN_IS_ASSIGNABLE(...) 
__is_assignable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_assignable) && gcc >= 9.0 -#if _CCCL_CHECK_BUILTIN(is_base_of) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_base_of) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_BASE_OF(...) __is_base_of(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_base_of) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(is_class) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_class) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_CLASS(...) __is_class(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_class) && gcc >= 4.3 @@ -290,31 +284,27 @@ # define _CCCL_BUILTIN_IS_CONST(...) __is_const(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_const) -#if _CCCL_CHECK_BUILTIN(is_constructible) || _CCCL_COMPILER(GCC, >=, 8) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_constructible) || _CCCL_COMPILER(GCC, >=, 8) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_CONSTRUCTIBLE(...) __is_constructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_constructible) && gcc >= 8.0 -#if _CCCL_CHECK_BUILTIN(is_convertible_to) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_convertible_to) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_CONVERTIBLE_TO(...) __is_convertible_to(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_convertible_to) -#if _CCCL_CHECK_BUILTIN(is_destructible) || defined(_CCCL_COMPILER_MSVC) +#if _CCCL_CHECK_BUILTIN(is_destructible) || _CCCL_COMPILER(MSVC) # define _CCCL_BUILTIN_IS_DESTRUCTIBLE(...) __is_destructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_destructible) -#if _CCCL_CHECK_BUILTIN(is_empty) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_empty) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_EMPTY(...) __is_empty(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_empty) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(is_enum) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_enum) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_ENUM(...) __is_enum(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_enum) && gcc >= 4.3 -#if _CCCL_CHECK_BUILTIN(is_final) || _CCCL_COMPILER(GCC, >=, 4, 7) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_final) || _CCCL_COMPILER(GCC, >=, 4, 7) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_FINAL(...) __is_final(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_final) && gcc >= 4.7 @@ -341,7 +331,7 @@ # define _CCCL_BUILTIN_IS_INTEGRAL(...) __is_integral(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_integral) -#if _CCCL_CHECK_BUILTIN(is_literal_type) || _CCCL_COMPILER(GCC, >=, 4, 6) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_literal_type) || _CCCL_COMPILER(GCC, >=, 4, 6) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_LITERAL(...) 
__is_literal_type(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_literal_type) && gcc >= 4.6 @@ -367,15 +357,15 @@ # define _CCCL_BUILTIN_IS_MEMBER_POINTER(...) __is_member_pointer(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_member_pointer) -#if _CCCL_CHECK_BUILTIN(is_nothrow_assignable) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_nothrow_assignable) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_NOTHROW_ASSIGNABLE(...) __is_nothrow_assignable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_nothrow_assignable) -#if _CCCL_CHECK_BUILTIN(is_nothrow_constructible) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_nothrow_constructible) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_NOTHROW_CONSTRUCTIBLE(...) __is_nothrow_constructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_nothrow_constructible) -#if _CCCL_CHECK_BUILTIN(is_nothrow_destructible) || defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_nothrow_destructible) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_NOTHROW_DESTRUCTIBLE(...) __is_nothrow_destructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_nothrow_destructible) @@ -388,8 +378,7 @@ # undef _CCCL_BUILTIN_IS_OBJECT #endif // _CCCL_CUDACC_BELOW(11, 3) -#if _CCCL_CHECK_BUILTIN(is_pod) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_pod) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_POD(...) __is_pod(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_pod) && gcc >= 4.3 @@ -398,7 +387,7 @@ # define _CCCL_BUILTIN_IS_POINTER(...) __is_pointer(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_pointer) -#if _CCCL_CHECK_BUILTIN(is_polymorphic) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_polymorphic) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_POLYMORPHIC(...) __is_polymorphic(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_polymorphic) && gcc >= 4.3 @@ -430,38 +419,36 @@ # define _CCCL_BUILTIN_IS_SIGNED(...) __is_signed(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_signed) -#if _CCCL_CHECK_BUILTIN(is_standard_layout) || _CCCL_COMPILER(GCC, >=, 4, 7) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_standard_layout) || _CCCL_COMPILER(GCC, >=, 4, 7) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_STANDARD_LAYOUT(...) __is_standard_layout(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_standard_layout) && gcc >= 4.7 -#if _CCCL_CHECK_BUILTIN(is_trivial) || _CCCL_COMPILER(GCC, >=, 4, 5) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_trivial) || _CCCL_COMPILER(GCC, >=, 4, 5) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIAL(...) __is_trivial(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivial) && gcc >= 4.5 -#if _CCCL_CHECK_BUILTIN(is_trivially_assignable) || _CCCL_COMPILER(GCC, >=, 5, 1) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_trivially_assignable) || _CCCL_COMPILER(GCC, >=, 5, 1) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIALLY_ASSIGNABLE(...) 
__is_trivially_assignable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_assignable) && gcc >= 5.1 -#if _CCCL_CHECK_BUILTIN(is_trivially_constructible) || _CCCL_COMPILER(GCC, >=, 5, 1) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_trivially_constructible) || _CCCL_COMPILER(GCC, >=, 5, 1) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIALLY_CONSTRUCTIBLE(...) __is_trivially_constructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_constructible) && gcc >= 5.1 -#if _CCCL_CHECK_BUILTIN(is_trivially_copyable) || _CCCL_COMPILER(GCC, >=, 5, 1) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(is_trivially_copyable) || _CCCL_COMPILER(GCC, >=, 5, 1) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_copyable) && gcc >= 5.1 -#if _CCCL_CHECK_BUILTIN(is_trivially_destructible) || defined(_CCCL_COMPILER_MSVC) +#if _CCCL_CHECK_BUILTIN(is_trivially_destructible) || _CCCL_COMPILER(MSVC) # define _CCCL_BUILTIN_IS_TRIVIALLY_DESTRUCTIBLE(...) __is_trivially_destructible(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_trivially_destructible) -#if _CCCL_CHECK_BUILTIN(is_union) || _CCCL_COMPILER(GCC, >=, 4, 3) || defined(_CCCL_COMPILER_MSVC) \ - || _CCCL_COMPILER(NVRTC) +#if _CCCL_CHECK_BUILTIN(is_union) || _CCCL_COMPILER(GCC, >=, 4, 3) || _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_IS_UNION(...) __is_union(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(is_union) && gcc >= 4.3 @@ -496,7 +483,7 @@ # define _CCCL_BUILTIN_ISNAN(...) __builtin_isnan(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(isnan) -#if _CCCL_CHECK_BUILTIN(make_integer_seq) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1923) +#if _CCCL_CHECK_BUILTIN(make_integer_seq) || _CCCL_COMPILER(MSVC, >=, 19, 23) # define _CCCL_BUILTIN_MAKE_INTEGER_SEQ(...) __make_integer_seq<__VA_ARGS__> #endif // _CCCL_CHECK_BUILTIN(make_integer_seq) @@ -553,27 +540,27 @@ # undef _CCCL_BUILTIN_TYPE_PACK_ELEMENT #endif // _CCCL_CUDACC_BELOW(12, 2) -#if _CCCL_CHECK_BUILTIN(underlying_type) || _CCCL_COMPILER(GCC, >=, 4, 7) || defined(_CCCL_COMPILER_MSVC) \ +#if _CCCL_CHECK_BUILTIN(underlying_type) || _CCCL_COMPILER(GCC, >=, 4, 7) || _CCCL_COMPILER(MSVC) \ || _CCCL_COMPILER(NVRTC) # define _CCCL_BUILTIN_UNDERLYING_TYPE(...) 
__underlying_type(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(underlying_type) && gcc >= 4.7 -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # // To use __builtin_FUNCSIG(), both MSVC and nvcc need to support it -# if _CCCL_MSVC_VERSION >= 1935 && _CCCL_CUDACC_AT_LEAST(12, 3) +# if _CCCL_COMPILER(MSVC, >=, 19, 35) && _CCCL_CUDACC_AT_LEAST(12, 3) # define _CCCL_BUILTIN_PRETTY_FUNCTION() __builtin_FUNCSIG() -# else // ^^^ _CCCL_MSVC_VERSION >= 1935 ^^^ / vvv _CCCL_MSVC_VERSION < 1935 vvv +# else // ^^^ _CCCL_COMPILER(MSVC, >=, 19, 35) ^^^ / vvv _CCCL_COMPILER(MSVC, <, 19, 35) vvv # define _CCCL_BUILTIN_PRETTY_FUNCTION() __FUNCSIG__ # define _CCCL_BROKEN_MSVC_FUNCSIG -# endif // _CCCL_MSVC_VERSION < 1935 -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# endif // _CCCL_COMPILER(MSVC, <, 19, 35) +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_BUILTIN_PRETTY_FUNCTION() __PRETTY_FUNCTION__ -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) // GCC's builtin_strlen isn't reliable at constexpr time // MSVC does not expose builtin_strlen before C++17 // NVRTC does not expose builtin_strlen -#if !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVRTC) && !(defined(_CCCL_COMPILER_MSVC) && _CCCL_STD_VER < 2017) +#if !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVRTC) && !(_CCCL_COMPILER(MSVC) && _CCCL_STD_VER < 2017) # define _CCCL_BUILTIN_STRLEN(...) __builtin_strlen(__VA_ARGS__) #endif diff --git a/libcudacxx/include/cuda/std/__cccl/compiler.h b/libcudacxx/include/cuda/std/__cccl/compiler.h index 89c054a12a1..fd7e93d22cb 100644 --- a/libcudacxx/include/cuda/std/__cccl/compiler.h +++ b/libcudacxx/include/cuda/std/__cccl/compiler.h @@ -27,9 +27,14 @@ #elif defined(__GNUC__) # define _CCCL_COMPILER_GCC _CCCL_COMPILER_MAKE_VERSION(__GNUC__, __GNUC_MINOR__) #elif defined(_MSC_VER) -# define _CCCL_COMPILER_MSVC -# define _CCCL_MSVC_VERSION _MSC_VER -# define _CCCL_MSVC_VERSION_FULL _MSC_FULL_VER +# define _CCCL_COMPILER_MSVC _CCCL_COMPILER_MAKE_VERSION(_MSC_VER / 100, _MSC_VER % 100) +# define _CCCL_COMPILER_MSVC2017 (_CCCL_COMPILER_MSVC < _CCCL_COMPILER_MAKE_VERSION(19, 20)) +# define _CCCL_COMPILER_MSVC2019 \ + (_CCCL_COMPILER_MSVC >= _CCCL_COMPILER_MAKE_VERSION(19, 20) \ + && _CCCL_COMPILER_MSVC < _CCCL_COMPILER_MAKE_VERSION(19, 30)) +# define _CCCL_COMPILER_MSVC2022 \ + (_CCCL_COMPILER_MSVC >= _CCCL_COMPILER_MAKE_VERSION(19, 30) \ + && _CCCL_COMPILER_MSVC < _CCCL_COMPILER_MAKE_VERSION(19, 40)) #elif defined(__CUDACC_RTC__) # define _CCCL_COMPILER_NVRTC _CCCL_COMPILER_MAKE_VERSION(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__) #endif @@ -52,17 +57,6 @@ _CCCL_COMPILER_COMPARE_BAD_ARG_COUNT)) #define _CCCL_COMPILER(...) 
_CCCL_COMPILER_SELECT(_CCCL_COMPILER_##__VA_ARGS__)(_CCCL_COMPILER_##__VA_ARGS__) -// Convenient shortcut to determine which version of MSVC we are dealing with -#if defined(_CCCL_COMPILER_MSVC) -# if _MSC_VER < 1920 -# define _CCCL_COMPILER_MSVC_2017 -# elif _MSC_VER < 1930 -# define _CCCL_COMPILER_MSVC_2019 -# else // _MSC_VER < 1940 -# define _CCCL_COMPILER_MSVC_2022 -# endif // _MSC_VER < 1940 -#endif // _CCCL_COMPILER_MSVC - // Determine the cuda compiler #if defined(__NVCC__) # define _CCCL_CUDA_COMPILER_NVCC @@ -100,10 +94,10 @@ #define _CCCL_TO_STRING(_STR) _CCCL_TO_STRING2(_STR) // Define the pragma for the host compiler -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_PRAGMA(_ARG) __pragma(_ARG) #else # define _CCCL_PRAGMA(_ARG) _Pragma(_CCCL_TO_STRING(_ARG)) -#endif // defined(_CCCL_COMPILER_MSVC) +#endif // _CCCL_COMPILER(MSVC) #endif // __CCCL_COMPILER_H diff --git a/libcudacxx/include/cuda/std/__cccl/diagnostic.h b/libcudacxx/include/cuda/std/__cccl/diagnostic.h index fdedae215f3..5824c65b8eb 100644 --- a/libcudacxx/include/cuda/std/__cccl/diagnostic.h +++ b/libcudacxx/include/cuda/std/__cccl/diagnostic.h @@ -55,7 +55,7 @@ # define _CCCL_DIAG_SUPPRESS_NVHPC(str) _CCCL_PRAGMA(diag_suppress str) # define _CCCL_DIAG_SUPPRESS_MSVC(str) # define _CCCL_DIAG_SUPPRESS_ICC(str) -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define _CCCL_DIAG_PUSH _CCCL_PRAGMA(warning(push)) # define _CCCL_DIAG_POP _CCCL_PRAGMA(warning(pop)) # define _CCCL_DIAG_SUPPRESS_CLANG(str) @@ -96,17 +96,17 @@ _CCCL_DIAG_PUSH \ _CCCL_DIAG_SUPPRESS_NVHPC(deprecated_entity) # define _CCCL_SUPPRESS_DEPRECATED_POP _CCCL_DIAG_POP -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define _CCCL_SUPPRESS_DEPRECATED_PUSH \ _CCCL_DIAG_PUSH \ _CCCL_DIAG_SUPPRESS_MSVC(4996) # define _CCCL_SUPPRESS_DEPRECATED_POP _CCCL_DIAG_POP #else // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(ICC) && && !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVHPC) && - // !_CCCL_COMPILER_MSVC + // !_CCCL_COMPILER(MSVC) # define _CCCL_SUPPRESS_DEPRECATED_PUSH # define _CCCL_SUPPRESS_DEPRECATED_POP #endif // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(ICC) && && !_CCCL_COMPILER(GCC) && !_CCCL_COMPILER(NVHPC) && - // !_CCCL_COMPILER_MSVC + // !_CCCL_COMPILER(MSVC) // Enable us to selectively silence cuda compiler warnings #if defined(_CCCL_CUDA_COMPILER) @@ -114,31 +114,31 @@ # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) # elif defined(__NVCC_DIAG_PRAGMA_SUPPORT__) || _CCCL_COMPILER(ICC) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(nv_diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(nv_diag_default _WARNING) # else // ^^^ _CCCL_COMPILER_{MSVC,ICC}^^^ / vvv !_CCCL_COMPILER_{MSVC,ICC} vvv # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(nv_diagnostic push) _CCCL_PRAGMA(nv_diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(nv_diagnostic pop) -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) # elif _CCCL_COMPILER(NVHPC) # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(diagnostic push) _CCCL_PRAGMA(diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(diagnostic pop) # else // ^^^ __NVCC_DIAG_PRAGMA_SUPPORT__ ^^^ / vvv !__NVCC_DIAG_PRAGMA_SUPPORT__ vvv -# if defined(_CCCL_COMPILER_MSVC_2017) // MSVC 2017 has issues with restoring the warning +# if _CCCL_COMPILER(MSVC2017) // MSVC 2017 has 
issues with restoring the warning # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) _CCCL_PRAGMA(diag_suppress _WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) _CCCL_PRAGMA(diag_default _WARNING) -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) # endif // !__NVCC_DIAG_PRAGMA_SUPPORT__ #else // ^^^ _CCCL_CUDA_COMPILER ^^^ / vvv !_CCCL_CUDA_COMPILER vvv # define _CCCL_NV_DIAG_SUPPRESS(_WARNING) # define _CCCL_NV_DIAG_DEFAULT(_WARNING) #endif // other compilers -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_HAS_PRAGMA_MSVC_WARNING # if !defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING) # define _CCCL_USE_PRAGMA_MSVC_WARNING @@ -160,13 +160,13 @@ # define _CCCL_MSVC_WARNINGS_PUSH \ _CCCL_PRAGMA(warning(push)) _CCCL_PRAGMA(warning(disable : _CCCL_MSVC_DISABLED_WARNINGS)) # define _CCCL_MSVC_WARNINGS_POP _CCCL_PRAGMA(warning(pop)) -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_MSVC_WARNINGS_PUSH # define _CCCL_MSVC_WARNINGS_POP -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #ifndef _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO -# if defined(_CCCL_COMPILER_MSVC_2017) || _CCCL_COMPILER(NVRTC) +# if _CCCL_COMPILER(MSVC2017) || _CCCL_COMPILER(NVRTC) # define _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO # endif #endif // _CCCL_HAS_NO_PRAGMA_PUSH_POP_MACRO diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 4b96695de73..8dfedd5a3cc 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -22,7 +22,7 @@ # pragma system_header #endif // no system header -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # if _MSVC_LANG <= 201103L # define _CCCL_STD_VER 2011 # elif _MSVC_LANG <= 201402L @@ -34,7 +34,7 @@ # else # define _CCCL_STD_VER 2023 // current year, or date of c++2b ratification # endif -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # if __cplusplus <= 199711L # define _CCCL_STD_VER 2003 # elif __cplusplus <= 201103L @@ -50,7 +50,7 @@ # else # define _CCCL_STD_VER 2024 // current year, or date of c++2c ratification # endif -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #if _CCCL_STD_VER >= 2014 # define _CCCL_CONSTEXPR_CXX14 constexpr diff --git a/libcudacxx/include/cuda/std/__cccl/exceptions.h b/libcudacxx/include/cuda/std/__cccl/exceptions.h index 24124bfa126..9f9e439e14e 100644 --- a/libcudacxx/include/cuda/std/__cccl/exceptions.h +++ b/libcudacxx/include/cuda/std/__cccl/exceptions.h @@ -25,8 +25,8 @@ #ifndef _CCCL_NO_EXCEPTIONS # if defined(CCCL_DISABLE_EXCEPTIONS) // Escape hatch for users to manually disable exceptions # define _CCCL_NO_EXCEPTIONS -# elif _CCCL_COMPILER(NVRTC) || (defined(_CCCL_COMPILER_MSVC) && _CPPUNWIND == 0) \ - || (!defined(_CCCL_COMPILER_MSVC) && !__EXCEPTIONS) // Catches all non msvc based compilers +# elif _CCCL_COMPILER(NVRTC) || (_CCCL_COMPILER(MSVC) && _CPPUNWIND == 0) \ + || (!_CCCL_COMPILER(MSVC) && !__EXCEPTIONS) // Catches all non msvc based compilers # define _CCCL_NO_EXCEPTIONS # endif #endif // 
!_CCCL_NO_EXCEPTIONS diff --git a/libcudacxx/include/cuda/std/__cccl/rtti.h b/libcudacxx/include/cuda/std/__cccl/rtti.h index 502407a0607..174b6313d87 100644 --- a/libcudacxx/include/cuda/std/__cccl/rtti.h +++ b/libcudacxx/include/cuda/std/__cccl/rtti.h @@ -38,7 +38,7 @@ # endif # elif _CCCL_COMPILER(NVRTC) # define _CCCL_NO_RTTI -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # if _CPPRTTI == 0 # define _CCCL_NO_RTTI # endif @@ -65,7 +65,7 @@ # endif # elif _CCCL_COMPILER(NVRTC) # define _CCCL_NO_TYPEID -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) // No-op, MSVC always supports typeid even when RTTI is disabled # elif _CCCL_COMPILER(CLANG) # if !_CCCL_HAS_FEATURE(cxx_rtti) diff --git a/libcudacxx/include/cuda/std/__cccl/system_header.h b/libcudacxx/include/cuda/std/__cccl/system_header.h index 2285bcf1651..d557dc88682 100644 --- a/libcudacxx/include/cuda/std/__cccl/system_header.h +++ b/libcudacxx/include/cuda/std/__cccl/system_header.h @@ -19,19 +19,18 @@ # define _CCCL_FORCE_SYSTEM_HEADER_GCC #elif _CCCL_COMPILER(CLANG) # define _CCCL_FORCE_SYSTEM_HEADER_CLANG -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define _CCCL_FORCE_SYSTEM_HEADER_MSVC #endif // other compilers // Potentially enable that cccl headers are treated as system headers -#if !defined(_CCCL_NO_SYSTEM_HEADER) \ - && !(defined(_CCCL_COMPILER_MSVC) && defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING)) && !_CCCL_COMPILER(NVRTC) \ - && !defined(_LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER) +#if !defined(_CCCL_NO_SYSTEM_HEADER) && !(_CCCL_COMPILER(MSVC) && defined(_LIBCUDACXX_DISABLE_PRAGMA_MSVC_WARNING)) \ + && !_CCCL_COMPILER(NVRTC) && !defined(_LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER) # if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(NVHPC) || _CCCL_COMPILER(ICC) # define _CCCL_IMPLICIT_SYSTEM_HEADER_GCC # elif _CCCL_COMPILER(CLANG) # define _CCCL_IMPLICIT_SYSTEM_HEADER_CLANG -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _CCCL_IMPLICIT_SYSTEM_HEADER_MSVC # endif // other compilers #endif // Use system header diff --git a/libcudacxx/include/cuda/std/__cccl/unreachable.h b/libcudacxx/include/cuda/std/__cccl/unreachable.h index eb6ae9a63db..f92a042fb05 100644 --- a/libcudacxx/include/cuda/std/__cccl/unreachable.h +++ b/libcudacxx/include/cuda/std/__cccl/unreachable.h @@ -35,18 +35,18 @@ # define _CCCL_UNREACHABLE() __builtin_unreachable() # endif // CUDACC above 11.4 #else // ^^^ __CUDA_ARCH__ ^^^ / vvv !__CUDA_ARCH__ vvv -# if defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(MSVC2017) template _LIBCUDACXX_HIDE_FROM_ABI __declspec(noreturn) void __cccl_unreachable_fallback() { __assume(0); } # define _CCCL_UNREACHABLE() __cccl_unreachable_fallback() -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _CCCL_UNREACHABLE() __assume(0) -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # define _CCCL_UNREACHABLE() __builtin_unreachable() -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #endif // !__CUDA_ARCH__ #endif // __CCCL_UNREACHABLE_H diff --git a/libcudacxx/include/cuda/std/__cccl/visibility.h b/libcudacxx/include/cuda/std/__cccl/visibility.h index ad35694a448..781e5a4fefa 100644 --- a/libcudacxx/include/cuda/std/__cccl/visibility.h +++ b/libcudacxx/include/cuda/std/__cccl/visibility.h @@ -37,21 +37,21 @@ #endif // _CCCL_COMPILER(NVHPC) // Enable us to hide kernels -#if defined(_CCCL_COMPILER_MSVC) || 
_CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_VISIBILITY_HIDDEN #else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv # define _CCCL_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) #endif // !_CCCL_COMPILER(NVRTC) -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_VISIBILITY_DEFAULT __declspec(dllimport) -#elif _CCCL_COMPILER(NVRTC) // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv +#elif _CCCL_COMPILER(NVRTC) // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv _CCCL_COMPILER(NVRTC) vvv # define _CCCL_VISIBILITY_DEFAULT #else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ / vvv !_CCCL_COMPILER(NVRTC) vvv # define _CCCL_VISIBILITY_DEFAULT __attribute__((__visibility__("default"))) #endif // !_CCCL_COMPILER(NVRTC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) # define _CCCL_TYPE_VISIBILITY_DEFAULT #elif _CCCL_HAS_ATTRIBUTE(__type_visibility__) # define _CCCL_TYPE_VISIBILITY_DEFAULT __attribute__((__type_visibility__("default"))) @@ -59,11 +59,11 @@ # define _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_VISIBILITY_DEFAULT #endif // !_CCCL_COMPILER(NVRTC) -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define _CCCL_FORCEINLINE __forceinline -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv _CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv _CCCL_COMPILER(MSVC) vvv # define _CCCL_FORCEINLINE __inline__ __attribute__((__always_inline__)) -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #if _CCCL_HAS_ATTRIBUTE(exclude_from_explicit_instantiation) # define _CCCL_EXCLUDE_FROM_EXPLICIT_INSTANTIATION __attribute__((exclude_from_explicit_instantiation)) diff --git a/libcudacxx/include/cuda/std/__concepts/concept_macros.h b/libcudacxx/include/cuda/std/__concepts/concept_macros.h index 6dc147e9084..2850c38a493 100644 --- a/libcudacxx/include/cuda/std/__concepts/concept_macros.h +++ b/libcudacxx/include/cuda/std/__concepts/concept_macros.h @@ -78,14 +78,14 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_true() return true; } -# if _CCCL_COMPILER(CLANG) || defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI __cccl_enable_if_t<_Bp> __cccl_requires() {} -# else // ^^^ _CCCL_COMPILER(CLANG) || defined(_CCCL_COMPILER_MSVC) ^^^ / vvv other compilers vvv +# else // ^^^ _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) ^^^ / vvv other compilers vvv template = 0> _CCCL_INLINE_VAR constexpr int __cccl_requires = 0; -# endif // !_CCCL_COMPILER(CLANG) && !defined(_CCCL_COMPILER_MSVC) +# endif // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI auto __cccl_make_dependent(_Tp*, _Tag<_Args...>*) -> _Tp; diff --git a/libcudacxx/include/cuda/std/__concepts/convertible_to.h b/libcudacxx/include/cuda/std/__concepts/convertible_to.h index 329b493b490..169383cb095 100644 --- a/libcudacxx/include/cuda/std/__concepts/convertible_to.h +++ b/libcudacxx/include/cuda/std/__concepts/convertible_to.h @@ -35,9 +35,9 @@ concept convertible_to = is_convertible_v<_From, _To> && requires { static_cast< #elif _CCCL_STD_VER >= 2014 // ^^^ C++20 ^^^ / vvv C++14/17 vvv -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(1211) // nonstandard cast to array type ignored -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(171) // invalid type conversion, e.g. 
[with _From=int **, _To=const int *const *] // We cannot put this conversion check with the other constraint, as types with deleted operator will break here @@ -55,9 +55,9 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT convertible_to = _CCCL_FRAGMENT(__convertible_to_, _From, _To); -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(1211) // nonstandard cast to array type ignored -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(171) // invalid type conversion, e.g. [with _From=int **, _To=const int *const *] #endif // _CCCL_STD_VER >= 2014 diff --git a/libcudacxx/include/cuda/std/__concepts/destructible.h b/libcudacxx/include/cuda/std/__concepts/destructible.h index 421b5e41335..90426478490 100644 --- a/libcudacxx/include/cuda/std/__concepts/destructible.h +++ b/libcudacxx/include/cuda/std/__concepts/destructible.h @@ -32,12 +32,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if _CCCL_STD_VER > 2011 -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) template _CCCL_CONCEPT destructible = __is_nothrow_destructible(_Tp); -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template _CCCL_INLINE_VAR constexpr bool __destructible_impl = false; @@ -67,7 +67,7 @@ _CCCL_INLINE_VAR constexpr bool __destructible<_Tp[_Nm]> = __destructible<_Tp>; template _CCCL_CONCEPT destructible = __destructible<_Tp>; -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) #endif // _CCCL_STD_VER > 2011 diff --git a/libcudacxx/include/cuda/std/__concepts/swappable.h b/libcudacxx/include/cuda/std/__concepts/swappable.h index 8635bc9cc6c..8688e71a702 100644 --- a/libcudacxx/include/cuda/std/__concepts/swappable.h +++ b/libcudacxx/include/cuda/std/__concepts/swappable.h @@ -37,9 +37,9 @@ #include #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(461) // nonstandard cast to array type ignored -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) #if _CCCL_STD_VER > 2011 @@ -199,8 +199,8 @@ _LIBCUDACXX_END_NAMESPACE_STD #endif // _CCCL_STD_VER > 2011 -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(461) // nonstandard cast to array type ignored -#endif // _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(MSVC) #endif // _LIBCUDACXX___CONCEPTS_SWAPPABLE_H diff --git a/libcudacxx/include/cuda/std/__fwd/get.h b/libcudacxx/include/cuda/std/__fwd/get.h index 9280f9d45d3..6fd977fd158 100644 --- a/libcudacxx/include/cuda/std/__fwd/get.h +++ b/libcudacxx/include/cuda/std/__fwd/get.h @@ -72,7 +72,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 const _Tp&& get(const array<_Tp, _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -109,6 +109,6 @@ using _CUDA_VRANGES::get; _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___FWD_GET_H diff --git a/libcudacxx/include/cuda/std/__fwd/subrange.h b/libcudacxx/include/cuda/std/__fwd/subrange.h index ba6b5e45ef5..d89df6f0ab2 100644 --- a/libcudacxx/include/cuda/std/__fwd/subrange.h +++ b/libcudacxx/include/cuda/std/__fwd/subrange.h @@ -22,7 +22,7 @@ #include -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) 
_LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -52,6 +52,6 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT subrange; _LIBCUDACXX_END_NAMESPACE_RANGES_ABI _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___FWD_SUBRANGE_H diff --git a/libcudacxx/include/cuda/std/__iterator/concepts.h b/libcudacxx/include/cuda/std/__iterator/concepts.h index 59b2d0818dc..e4e507afe83 100644 --- a/libcudacxx/include/cuda/std/__iterator/concepts.h +++ b/libcudacxx/include/cuda/std/__iterator/concepts.h @@ -403,7 +403,7 @@ template _CCCL_CONCEPT bidirectional_iterator = _CCCL_FRAGMENT(__bidirectional_iterator_, _Ip); // [iterator.concept.random.access] -# if defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(MSVC2017) // For whatever reasons MSVC2017 cannot check decltype(__n + __j) template _CCCL_CONCEPT_FRAGMENT( @@ -415,7 +415,7 @@ _CCCL_CONCEPT_FRAGMENT( requires(same_as<_Ip&, decltype(__i -= __n)>), requires(same_as<_Ip, decltype(__j - __n)>), requires(same_as, decltype(__j[__n])>))); -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv template _CCCL_CONCEPT_FRAGMENT( __random_access_iterator_operations_, @@ -426,7 +426,7 @@ _CCCL_CONCEPT_FRAGMENT( requires(same_as<_Ip&, decltype(__i -= __n)>), requires(same_as<_Ip, decltype(__j - __n)>), requires(same_as, decltype(__j[__n])>))); -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) template _CCCL_CONCEPT __random_access_iterator_operations = _CCCL_FRAGMENT(__random_access_iterator_operations_, _Ip); diff --git a/libcudacxx/include/cuda/std/__iterator/distance.h b/libcudacxx/include/cuda/std/__iterator/distance.h index 1e6fae1c988..441c30c9a9f 100644 --- a/libcudacxx/include/cuda/std/__iterator/distance.h +++ b/libcudacxx/include/cuda/std/__iterator/distance.h @@ -59,7 +59,7 @@ distance(_InputIter __first, _InputIter __last) } _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) // [range.iter.op.distance] @@ -118,6 +118,6 @@ _CCCL_GLOBAL_CONSTANT auto distance = __distance::__fn{}; } // namespace __cpo _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ITERATOR_DISTANCE_H diff --git a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h index f20dde7d1b1..759af45cc3a 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h @@ -40,11 +40,11 @@ #include #if !_CCCL_COMPILER(NVRTC) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # include // for ::std::input_iterator_tag -# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv # include // for ::std::input_iterator_tag -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) # if _CCCL_STD_VER >= 2020 template diff --git a/libcudacxx/include/cuda/std/__iterator/move_iterator.h b/libcudacxx/include/cuda/std/__iterator/move_iterator.h index 7e2e176b817..efdf656366a 100644 --- a/libcudacxx/include/cuda/std/__iterator/move_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/move_iterator.h @@ 
-107,7 +107,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator _Iter __current_; #if _CCCL_STD_VER >= 2017 -# if !defined(_CCCL_COMPILER_MSVC_2017) +# if !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_HIDE_FROM_ABI static constexpr auto __mi_get_iter_concept() { if constexpr (random_access_iterator<_Iter>) @@ -128,22 +128,22 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator } _CCCL_UNREACHABLE(); } -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) #endif // _CCCL_STD_VER >= 2017 public: #if _CCCL_STD_VER > 2014 using iterator_type = _Iter; -# if defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(MSVC2017) // clang-format off using iterator_concept = conditional_t, random_access_iterator_tag, conditional_t, bidirectional_iterator_tag, conditional_t, forward_iterator_tag, input_iterator_tag>>>; // clang-format on -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv using iterator_concept = decltype(__mi_get_iter_concept()); -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) // iterator_category is inherited and not always present using value_type = iter_value_t<_Iter>; @@ -370,7 +370,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator return _CUDA_VRANGES::iter_move(__i.__current_); } -# if defined(_CCCL_COMPILER_MSVC_2017) // MSVC2017 cannot find _Iter otherwise +# if _CCCL_COMPILER(MSVC2017) // MSVC2017 cannot find _Iter otherwise template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap( const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) noexcept(__noexcept_swappable<_Iter1, _Iter2>) @@ -378,7 +378,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator { return _CUDA_VRANGES::iter_swap(__x.__current_, __y.__current_); } -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap(const move_iterator& __x, const move_iterator<_Iter2>& __y) noexcept(__noexcept_swappable<_Iter, _Iter2>) @@ -386,17 +386,17 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator { return _CUDA_VRANGES::iter_swap(__x.__current_, __y.__current_); } -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) #endif // _CCCL_STD_VER > 2014 }; _LIBCUDACXX_CTAD_SUPPORTED_FOR_TYPE(move_iterator); // Some compilers have issues determining _IsFancyPointer -#if _CCCL_COMPILER(GCC) || defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC) template struct _IsFancyPointer> : _IsFancyPointer<_Iter> {}; -#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER_MSVC +#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool diff --git a/libcudacxx/include/cuda/std/__iterator/next.h b/libcudacxx/include/cuda/std/__iterator/next.h index 4651214e4bd..f100e76ec9d 100644 --- a/libcudacxx/include/cuda/std/__iterator/next.h +++ b/libcudacxx/include/cuda/std/__iterator/next.h @@ -42,7 +42,7 @@ next(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) // [range.iter.op.next] @@ -90,6 +90,6 @@ _CCCL_GLOBAL_CONSTANT auto next = __next::__fn{}; } // namespace __cpo _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER > 2014 && 
!defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ITERATOR_NEXT_H diff --git a/libcudacxx/include/cuda/std/__iterator/prev.h b/libcudacxx/include/cuda/std/__iterator/prev.h index f28098d9e45..1e5e78d043e 100644 --- a/libcudacxx/include/cuda/std/__iterator/prev.h +++ b/libcudacxx/include/cuda/std/__iterator/prev.h @@ -41,7 +41,7 @@ prev(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = _LIBCUDACXX_END_NAMESPACE_STD -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) // [range.iter.op.prev] @@ -81,6 +81,6 @@ _CCCL_GLOBAL_CONSTANT auto prev = __prev::__fn{}; } // namespace __cpo _LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___ITERATOR_PREV_H diff --git a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h index 6f2b0cce65e..982312731f9 100644 --- a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h @@ -248,7 +248,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator return _CUDA_VRANGES::iter_move(--__tmp); } -# if defined(_CCCL_COMPILER_MSVC_2017) // MSVC2017 cannot find _Iter otherwise +# if _CCCL_COMPILER(MSVC2017) // MSVC2017 cannot find _Iter otherwise template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap(const reverse_iterator<_Iter1>& __x, @@ -259,7 +259,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator auto __ytmp = __y.base(); _CUDA_VRANGES::iter_swap(--__xtmp, --__ytmp); } -# else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +# else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv template _LIBCUDACXX_HIDE_FROM_ABI friend constexpr auto iter_swap(const reverse_iterator& __x, @@ -270,7 +270,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator auto __ytmp = __y.base(); return _CUDA_VRANGES::iter_swap(--__xtmp, --__ytmp); } -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) #endif // _CCCL_STD_VER > 2014 }; diff --git a/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h b/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h index 785c6d149c9..3ffffea090c 100644 --- a/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h +++ b/libcudacxx/include/cuda/std/__iterator/unreachable_sentinel.h @@ -35,12 +35,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI // are only ever found through ADL struct unreachable_sentinel_t -# ifdef _CCCL_COMPILER_MSVC +# if _CCCL_COMPILER(MSVC) ; namespace __unreachable_sentinel_detail { struct __unreachable_base -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) { _CCCL_TEMPLATE(class _Iter) _CCCL_REQUIRES(weakly_incrementable<_Iter>) @@ -74,11 +74,11 @@ struct __unreachable_base # endif // _CCCL_STD_VER < 2020 }; -# ifdef _CCCL_COMPILER_MSVC +# if _CCCL_COMPILER(MSVC) } // namespace __unreachable_sentinel_detail struct unreachable_sentinel_t : __unreachable_sentinel_detail::__unreachable_base {}; -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) _LIBCUDACXX_END_NAMESPACE_RANGES_ABI diff --git a/libcudacxx/include/cuda/std/__memory/assume_aligned.h b/libcudacxx/include/cuda/std/__memory/assume_aligned.h index c8f9310ed1a..ce7b70e6a01 100644 --- 
a/libcudacxx/include/cuda/std/__memory/assume_aligned.h +++ b/libcudacxx/include/cuda/std/__memory/assume_aligned.h @@ -36,9 +36,9 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp* assume_alig #if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && defined(_CCCL_BUILTIN_ASSUME_ALIGNED) if (!_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) { -# if !defined(_CCCL_COMPILER_MSVC) // MSVC checks within the builtin +# if !_CCCL_COMPILER(MSVC) // MSVC checks within the builtin _CCCL_ASSERT(reinterpret_cast(__ptr) % _Align == 0, "Alignment assumption is violated"); -# endif // !_CCCL_COMPILER_MSVC +# endif // !_CCCL_COMPILER(MSVC) return static_cast<_Tp*>(_CCCL_BUILTIN_ASSUME_ALIGNED(__ptr, _Align)); } else diff --git a/libcudacxx/include/cuda/std/__ranges/access.h b/libcudacxx/include/cuda/std/__ranges/access.h index 9a18ddd88b6..2c1525e1ad4 100644 --- a/libcudacxx/include/cuda/std/__ranges/access.h +++ b/libcudacxx/include/cuda/std/__ranges/access.h @@ -33,7 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER > 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) template _CCCL_CONCEPT __can_borrow = is_lvalue_reference_v<_Tp> || enable_borrowed_range>; @@ -279,7 +279,7 @@ inline namespace __cpo { _CCCL_GLOBAL_CONSTANT auto cend = __cend::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER > 2014 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/concepts.h b/libcudacxx/include/cuda/std/__ranges/concepts.h index 8b4aa426b51..26d7fe421e7 100644 --- a/libcudacxx/include/cuda/std/__ranges/concepts.h +++ b/libcudacxx/include/cuda/std/__ranges/concepts.h @@ -44,7 +44,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) # if _CCCL_STD_VER >= 2020 @@ -301,7 +301,7 @@ template _CCCL_CONCEPT __container_compatible_range = _CCCL_FRAGMENT(__container_compatible_range_, _Range, _Tp); # endif // _CCCL_STD_VER <= 2017 -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/dangling.h b/libcudacxx/include/cuda/std/__ranges/dangling.h index e0974298c03..b97e5e5555a 100644 --- a/libcudacxx/include/cuda/std/__ranges/dangling.h +++ b/libcudacxx/include/cuda/std/__ranges/dangling.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) struct dangling { @@ -47,7 +47,7 @@ using borrowed_iterator_t = enable_if_t, _If, ite // borrowed_subrange_t defined in <__ranges/subrange.h> -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/data.h b/libcudacxx/include/cuda/std/__ranges/data.h index 0e949a12489..f5bf6015963 100644 --- a/libcudacxx/include/cuda/std/__ranges/data.h +++ b/libcudacxx/include/cuda/std/__ranges/data.h @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [range.prim.data] @@ -128,7 +128,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto cdata = __cdata::__fn{}; } // 
namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/empty.h b/libcudacxx/include/cuda/std/__ranges/empty.h index 9eee04a6644..d8f8213e9a8 100644 --- a/libcudacxx/include/cuda/std/__ranges/empty.h +++ b/libcudacxx/include/cuda/std/__ranges/empty.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [range.prim.empty] @@ -104,7 +104,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto empty = __empty::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/rbegin.h b/libcudacxx/include/cuda/std/__ranges/rbegin.h index 671e8e31798..8b70f702797 100644 --- a/libcudacxx/include/cuda/std/__ranges/rbegin.h +++ b/libcudacxx/include/cuda/std/__ranges/rbegin.h @@ -33,7 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [ranges.access.rbegin] @@ -168,7 +168,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto crbegin = __crbegin::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/rend.h b/libcudacxx/include/cuda/std/__ranges/rend.h index 28ec5e9e021..5c266d63bdd 100644 --- a/libcudacxx/include/cuda/std/__ranges/rend.h +++ b/libcudacxx/include/cuda/std/__ranges/rend.h @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [range.access.rend] @@ -174,7 +174,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto crend = __crend::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/size.h b/libcudacxx/include/cuda/std/__ranges/size.h index 0c87e1c1ef3..04487441586 100644 --- a/libcudacxx/include/cuda/std/__ranges/size.h +++ b/libcudacxx/include/cuda/std/__ranges/size.h @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template _CCCL_INLINE_VAR constexpr bool disable_sized_range = false; @@ -200,7 +200,7 @@ inline namespace __cpo _CCCL_GLOBAL_CONSTANT auto ssize = __ssize::__fn{}; } // namespace __cpo -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/subrange.h b/libcudacxx/include/cuda/std/__ranges/subrange.h index a9eb9f2572f..190df21d43b 100644 --- a/libcudacxx/include/cuda/std/__ranges/subrange.h +++ b/libcudacxx/include/cuda/std/__ranges/subrange.h @@ -51,7 +51,7 @@ #include #include -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // MSVC complains about [[msvc::no_unique_address]] 
prior to C++20 as a vendor extension _CCCL_DIAG_PUSH @@ -514,6 +514,6 @@ _LIBCUDACXX_END_NAMESPACE_STD _CCCL_DIAG_POP -#endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___RANGES_SUBRANGE_H diff --git a/libcudacxx/include/cuda/std/__ranges/unwrap_end.h b/libcudacxx/include/cuda/std/__ranges/unwrap_end.h index f134f141e8f..9e0b6636ff0 100644 --- a/libcudacxx/include/cuda/std/__ranges/unwrap_end.h +++ b/libcudacxx/include/cuda/std/__ranges/unwrap_end.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(forward_range<_Range>) @@ -46,7 +46,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator_t<_Range> __unwrap_ _CCCL_UNREACHABLE(); } -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/view_interface.h b/libcudacxx/include/cuda/std/__ranges/view_interface.h index f7f14b53c09..661e20c1b68 100644 --- a/libcudacxx/include/cuda/std/__ranges/view_interface.h +++ b/libcudacxx/include/cuda/std/__ranges/view_interface.h @@ -37,7 +37,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) # if _CCCL_STD_VER >= 2020 template @@ -178,7 +178,7 @@ class view_interface _LIBCUDACXX_END_NAMESPACE_RANGES_ABI -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_END_NAMESPACE_RANGES diff --git a/libcudacxx/include/cuda/std/__ranges/views.h b/libcudacxx/include/cuda/std/__ranges/views.h index 8941de6c14d..3954877f117 100644 --- a/libcudacxx/include/cuda/std/__ranges/views.h +++ b/libcudacxx/include/cuda/std/__ranges/views.h @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_VIEWS @@ -33,6 +33,6 @@ namespace views = ranges::views; _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX___RANGES_VIEWS diff --git a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h index 8c381a9af91..e054f78729e 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h @@ -157,7 +157,7 @@ struct tuple_element<_Ip, const volatile _CUDA_VSTD::tuple<_Tp...>> : _CUDA_VSTD::tuple_element<_Ip, const volatile _CUDA_VSTD::tuple<_Tp...>> {}; -# if !defined(_CCCL_COMPILER_MSVC_2017) +# if !_CCCL_COMPILER(MSVC2017) template struct tuple_size<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : _CUDA_VSTD::tuple_size<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> @@ -197,7 +197,7 @@ template struct tuple_element<_Idx, const volatile _CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : _CUDA_VSTD::tuple_element<_Idx, const volatile _CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> {}; -# endif // !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(MSVC2017) } // namespace std #endif // _CCCL_STD_VER >= 2017 diff --git 
a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h index b1f2273b035..28a6b1dada9 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h @@ -56,11 +56,11 @@ template struct __tuple_like> : true_type {}; -#if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template struct __tuple_like<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : true_type {}; -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template struct __tuple_like<__tuple_types<_Tp...>> : true_type diff --git a/libcudacxx/include/cuda/std/__type_traits/common_reference.h b/libcudacxx/include/cuda/std/__type_traits/common_reference.h index 020925bfb2c..6f62a1033ef 100644 --- a/libcudacxx/include/cuda/std/__type_traits/common_reference.h +++ b/libcudacxx/include/cuda/std/__type_traits/common_reference.h @@ -42,7 +42,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // common_reference // Let COND_RES(X, Y) be: -#ifdef _CCCL_COMPILER_MSVC // Workaround for DevCom-1627396 +#if _CCCL_COMPILER(MSVC) // Workaround for DevCom-1627396 template _Tp __returns_exactly() noexcept; // not defined @@ -67,10 +67,10 @@ struct __cond_res_workaround<_Tp, _Up, void_t<__cond_res_if_right<_Tp, _Up>>> template using __cond_res = typename __cond_res_workaround<_Xp, _Yp>::type; -#else // ^^^ MSVC ^^^ / vvv !MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template using __cond_res = decltype(false ? _CUDA_VSTD::declval<_Xp (&)()>()() : _CUDA_VSTD::declval<_Yp (&)()>()()); -#endif // !MSVC +#endif // !_CCCL_COMPILER(MSVC) // Let `XREF(A)` denote a unary alias template `T` such that `T` denotes the same type as `U` // with the addition of `A`'s cv and reference qualifiers, for a non-reference cv-unqualified type diff --git a/libcudacxx/include/cuda/std/__type_traits/common_type.h b/libcudacxx/include/cuda/std/__type_traits/common_type.h index 319d6fb7143..09067b52084 100644 --- a/libcudacxx/include/cuda/std/__type_traits/common_type.h +++ b/libcudacxx/include/cuda/std/__type_traits/common_type.h @@ -90,11 +90,11 @@ struct __common_type2_imp : __common_type3<_Tp, _Up> // branches have diverging return types, this happens for extended floating point types template using __msvc_declval_workaround = -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) enable_if_t<_CCCL_TRAIT(is_same, __cond_type<_Tp, _Up>, __cond_type<_Up, _Tp>)>; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv void; -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) // sub-bullet 3 - "if decay_t() : declval())> ..." 
template diff --git a/libcudacxx/include/cuda/std/__type_traits/disjunction.h b/libcudacxx/include/cuda/std/__type_traits/disjunction.h index 01fe64735a5..61bfca3f428 100644 --- a/libcudacxx/include/cuda/std/__type_traits/disjunction.h +++ b/libcudacxx/include/cuda/std/__type_traits/disjunction.h @@ -51,7 +51,7 @@ struct _OrImpl template using _Or _CCCL_NODEBUG_ALIAS = typename _OrImpl::template _Result; -#ifdef _CCCL_COMPILER_MSVC +#if _CCCL_COMPILER(MSVC) template struct disjunction : false_type {}; @@ -63,7 +63,7 @@ struct disjunction<_First, _Rest...> : _OrImpl::template _Result struct disjunction : _Or<_Args...> {}; -#endif // !MSVC +#endif // !_CCCL_COMPILER(MSVC) #if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_convertible.h b/libcudacxx/include/cuda/std/__type_traits/is_convertible.h index 11b16014cb8..4fbcb82deb6 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_convertible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_convertible.h @@ -43,7 +43,7 @@ template _CCCL_INLINE_VAR constexpr bool is_convertible_v = _CCCL_BUILTIN_IS_CONVERTIBLE_TO(_T1, _T2); # endif // !_CCCL_NO_VARIABLE_TEMPLATES -# ifdef _CCCL_COMPILER_MSVC // Workaround for DevCom-1627396 +# if _CCCL_COMPILER(MSVC) // Workaround for DevCom-1627396 template struct is_convertible<_Ty&, volatile _Ty&> : true_type {}; @@ -71,7 +71,7 @@ _CCCL_INLINE_VAR constexpr bool is_convertible_v<_Ty&, const volatile _Ty&> = tr template _CCCL_INLINE_VAR constexpr bool is_convertible_v = true; -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) #else // ^^^ _CCCL_BUILTIN_IS_CONVERTIBLE_TO ^^^ / vvv !_CCCL_BUILTIN_IS_CONVERTIBLE_TO vvv diff --git a/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h b/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h index 9c6a7ebc53d..d9d536d2b80 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_primary_template.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) template struct __is_primary_template : false_type {}; @@ -37,13 +37,13 @@ struct __is_primary_template<_Tp, void_t> : public is_same<_Tp, typename _Tp::__primary_template> {}; -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +#else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template using __test_for_primary_template = enable_if_t<_IsSame<_Tp, typename _Tp::__primary_template>::value>; template using __is_primary_template = _IsValidExpansion<__test_for_primary_template, _Tp>; -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index 4bd928b0013..00f69273673 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -559,7 +559,7 @@ using __type_back = __type_at_c<_List::__size - 1, _List>; namespace __detail { -# if defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1938 +# if _CCCL_COMPILER(MSVC, <, 19, 38) // A workaround for https://developercommunity.visualstudio.com/t/fatal-error-C1001:-Internal-compiler-err/10405847 struct __type_concat_fn { @@ -586,7 +586,7 @@ struct __type_concat_fn template using __call _CCCL_NODEBUG_ALIAS = __type<__trait<_Lists...>>; }; -# else // ^^^ _CCCL_COMPILER_MSVC < 19.38 ^^^ / vvv 
!(_CCCL_COMPILER_MSVC < 19.38) vvv +# else // ^^^ _CCCL_COMPILER(MSVC, <, 19, 38) ^^^ / vvv _CCCL_COMPILER(MSVC, >=, 19, 38) vvv template struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_maybe_concat_fn { @@ -646,7 +646,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_concat_fn __type_list_ptr<>{nullptr}, __type_list_ptr<>{nullptr})); }; -# endif // !(_CCCL_COMPILER_MSVC < 19.38) +# endif // _CCCL_COMPILER(MSVC, >=, 19, 38) } // namespace __detail //! \brief Concatenate a list of type lists into a single type list. diff --git a/libcudacxx/include/cuda/std/__type_traits/type_set.h b/libcudacxx/include/cuda/std/__type_traits/type_set.h index e73c6161070..c83ebd06af7 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_set.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_set.h @@ -84,7 +84,7 @@ struct __bulk_insert template <> struct __bulk_insert { -#if defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1920 +#if _CCCL_COMPILER(MSVC, <, 19, 20) template _LIBCUDACXX_HIDE_FROM_ABI static auto __insert_fn(__type_list<_Ty, _Us...>*) -> typename __bulk_insert::template __call, _Us...>; diff --git a/libcudacxx/include/cuda/std/__utility/auto_cast.h b/libcudacxx/include/cuda/std/__utility/auto_cast.h index ad4107ab178..b766493501f 100644 --- a/libcudacxx/include/cuda/std/__utility/auto_cast.h +++ b/libcudacxx/include/cuda/std/__utility/auto_cast.h @@ -23,7 +23,7 @@ #include -#if _CCCL_STD_VER < 2020 && defined(_CCCL_COMPILER_MSVC) +#if _CCCL_STD_VER < 2020 && _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_AUTO_CAST(expr) (_CUDA_VSTD::decay_t) (expr) #else # define _LIBCUDACXX_AUTO_CAST(expr) static_cast<_CUDA_VSTD::decay_t>(expr) diff --git a/libcudacxx/include/cuda/std/__utility/declval.h b/libcudacxx/include/cuda/std/__utility/declval.h index 96499be6e67..d7f701c201a 100644 --- a/libcudacxx/include/cuda/std/__utility/declval.h +++ b/libcudacxx/include/cuda/std/__utility/declval.h @@ -30,8 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // MSVC < 19.39 to miscompile so we use the fallback instead. The use of the // `__identity_t` alias is help MSVC parse the declaration correctly. 
#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) && !defined(_CCCL_NO_NOEXCEPT_FUNCTION_TYPE) \ - && !(defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_CUDACC_BELOW(12, 4)) \ - && !(defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION < 1939) + && !(defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_CUDACC_BELOW(12, 4)) && !_CCCL_COMPILER(MSVC, <, 19, 39) template using __identity_t _CCCL_NODEBUG_ALIAS = _Tp; diff --git a/libcudacxx/include/cuda/std/bitset b/libcudacxx/include/cuda/std/bitset index 7c9839bc043..60d0e912c80 100644 --- a/libcudacxx/include/cuda/std/bitset +++ b/libcudacxx/include/cuda/std/bitset @@ -526,15 +526,15 @@ protected: _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 unsigned long to_ulong() const { -#ifdef _CCCL_COMPILER_MSVC +#if _CCCL_COMPILER(MSVC) if (static_cast(__first_.__data) != __first_.__data) { _CUDA_VSTD::__throw_overflow_error("bitset to_ulong overflow error"); } return static_cast(__first_.__data); -#else // ^^ MSVC ^^ | vv !MSVC vv +#else // ^^ _CCCL_COMPILER(MSVC) ^^ | vv !_CCCL_COMPILER(MSVC) vv return __first_.__data; -#endif // !MSVC +#endif // !_CCCL_COMPILER(MSVC) } _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 unsigned long long to_ullong() const diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index 2cdeeb4c1ef..2944ef6d4c8 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -115,7 +115,7 @@ extern "C++" { # elif defined(_LIBCUDACXX_ABI_FORCE_MICROSOFT) # define _LIBCUDACXX_ABI_MICROSOFT # else -# if defined(_WIN32) && defined(_CCCL_COMPILER_MSVC) +# if defined(_WIN32) && _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_ABI_MICROSOFT # else # define _LIBCUDACXX_ABI_ITANIUM @@ -186,7 +186,7 @@ extern "C++" { # define _LIBCUDACXX_MSVCRT_LIKE // If mingw not explicitly detected, assume using MS C runtime only if // a MS compatibility version is specified. 
-# if defined(_CCCL_COMPILER_MSVC) && !defined(__MINGW32__) +# if _CCCL_COMPILER(MSVC) && !defined(__MINGW32__) # define _LIBCUDACXX_MSVCRT // Using Microsoft's C Runtime library # endif # if (defined(_M_AMD64) || defined(__x86_64__)) || (defined(_M_ARM) || defined(__arm__)) @@ -287,14 +287,14 @@ extern "C++" { # define __alignof(x) alignof(x) # endif // _CCCL_COMPILER(NVRTC) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define __alignof__ __alignof # endif # define _LIBCUDACXX_ALIGNOF(_Tp) alignof(_Tp) # define _LIBCUDACXX_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) # define _CCCL_ALIGNAS_TYPE(x) alignas(x) # define _CCCL_ALIGNAS(x) __declspec(align(x)) # elif _CCCL_HAS_FEATURE(cxx_alignas) @@ -303,7 +303,7 @@ extern "C++" { # else # define _CCCL_ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCUDACXX_ALIGNOF(x)))) # define _CCCL_ALIGNAS(x) __attribute__((__aligned__(x))) -# endif // !_CCCL_COMPILER_MSVC && !_CCCL_HAS_FEATURE(cxx_alignas) +# endif // !_CCCL_COMPILER(MSVC) && !_CCCL_HAS_FEATURE(cxx_alignas) // This is wrapped in __CUDA_ARCH__ to prevent error: "ignoring '#pragma unroll' // [-Werror=unknown-pragmas]" @@ -389,7 +389,7 @@ typedef __char32_t char32_t; # define _LIBCUDACXX_DISABLE_EXTENSION_WARNING __extension__ -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_WARNING(x) _CCCL_PRAGMA(message(__FILE__ "(" _CCCL_TO_STRING(__LINE__) ") : warning note: " x)) @@ -461,7 +461,7 @@ typedef __char32_t char32_t; # endif // _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS # ifndef _LIBCUDACXX_HAS_NO_INT128 -# if defined(_CCCL_COMPILER_MSVC) || (_CCCL_COMPILER(NVRTC) && !defined(__CUDACC_RTC_INT128__)) \ +# if _CCCL_COMPILER(MSVC) || (_CCCL_COMPILER(NVRTC) && !defined(__CUDACC_RTC_INT128__)) \ || (defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_CUDACC_BELOW(11, 5)) || !defined(__SIZEOF_INT128__) # define _LIBCUDACXX_HAS_NO_INT128 # endif @@ -605,7 +605,7 @@ typedef unsigned int char32_t; // If we are getting operator new from the MSVC CRT, then allocation overloads // for align_val_t were added in 19.12, aka VS 2017 version 15.3. -# if defined(_LIBCUDACXX_MSVCRT) && defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1912 +# if defined(_LIBCUDACXX_MSVCRT) && _CCCL_COMPILER(MSVC, <, 19, 12) # define _LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION # elif defined(_LIBCUDACXX_ABI_VCRUNTIME) && !defined(__cpp_aligned_new) // We're deferring to Microsoft's STL to provide aligned new et al. 
We don't @@ -681,7 +681,7 @@ typedef unsigned int char32_t; # endif // _LIBCUDACXX_HAS_THREAD_API_CUDA # ifndef _LIBCUDACXX_HAS_THREAD_API_WIN32 -# if defined(_CCCL_COMPILER_MSVC) && !defined(_LIBCUDACXX_HAS_THREAD_API_CUDA) +# if _CCCL_COMPILER(MSVC) && !defined(_LIBCUDACXX_HAS_THREAD_API_CUDA) # define _LIBCUDACXX_HAS_THREAD_API_WIN32 # endif # endif // _LIBCUDACXX_HAS_THREAD_API_WIN32 @@ -744,7 +744,7 @@ typedef unsigned int char32_t; # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP # elif _CCCL_COMPILER(NVHPC) # define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL # endif @@ -800,7 +800,7 @@ typedef unsigned int char32_t; # define _LIBCUDACXX_PREFERRED_NAME(x) # endif -# if defined(_LIBCUDACXX_ABI_MICROSOFT) && (defined(_CCCL_COMPILER_MSVC) || __has_declspec_attribute(empty_bases)) +# if defined(_LIBCUDACXX_ABI_MICROSOFT) && (_CCCL_COMPILER(MSVC) || __has_declspec_attribute(empty_bases)) # define _LIBCUDACXX_DECLSPEC_EMPTY_BASES __declspec(empty_bases) # else # define _LIBCUDACXX_DECLSPEC_EMPTY_BASES @@ -822,7 +822,7 @@ typedef unsigned int char32_t; # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS # elif _CCCL_COMPILER(NVRTC) # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -# elif defined(_CCCL_COMPILER_MSVC) +# elif _CCCL_COMPILER(MSVC) # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS # elif _CCCL_CUDACC_BELOW(11, 8) # define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/climits b/libcudacxx/include/cuda/std/detail/libcxx/include/climits index ffe87d1caf0..f5b285ccc25 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/climits +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/climits @@ -51,7 +51,7 @@ Macros: _CCCL_PUSH_MACROS -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include #endif // _LIBCUDACXX_MSVCRT diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath index bf0f4da10f5..7066ddec4f2 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath @@ -683,7 +683,7 @@ __constexpr_isfinite(_A1 __lcpp_x) noexcept return isfinite(__lcpp_x); } -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_copysign(_A1 __x, _A1 __y) noexcept { @@ -715,9 +715,9 @@ __constexpr_copysign(_A1 __x, _A2 __y) noexcept static_assert((!(_IsSame<_A1, __result_type>::value && _IsSame<_A2, __result_type>::value)), ""); return __builtin_copysign((__result_type) __x, (__result_type) __y); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_fabs(_A1 __x) noexcept { @@ -744,9 +744,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_fabs(_Tp __x) { return __builtin_fabs(static_cast(__x)); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_fmax(_A1 __x, _A1 __y) noexcept { @@ -829,9 +829,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX __promote_t<_Tp, _Up> __ using __result_type = 
__promote_t<_Tp, _Up>; return _CUDA_VSTD::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y)); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_logb(_A1 __x) { @@ -874,7 +874,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_logb(_Tp } #endif // !_MSVC -#if defined(_CCCL_COMPILER_MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) +#if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) || defined(_CCCL_CUDA_COMPILER_CLANG) template _LIBCUDACXX_HIDE_FROM_ABI _Tp __constexpr_scalbn(_Tp __x, int __i) { @@ -958,7 +958,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ # endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) return __builtin_scalbn(__x, __exp); } -#endif // !_CCCL_COMPILER_MSVC +#endif // !_CCCL_COMPILER(MSVC) #if _CCCL_STD_VER > 2017 template diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/limits b/libcudacxx/include/cuda/std/detail/libcxx/include/limits index 06845b5f664..ea830da6046 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/limits +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/limits @@ -118,7 +118,7 @@ template<> class numeric_limits; _CCCL_PUSH_MACROS -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # include #endif // _LIBCUDACXX_MSVCRT diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span index afe5ea34519..75774146c09 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/span +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span @@ -203,9 +203,9 @@ template _CCCL_CONCEPT __span_array_convertible = _CCCL_TRAIT(is_convertible, _From (*)[], _To (*)[]); // We want to ensure that span interacts nicely with containers that might not have had the ranges treatment -# if defined(__cpp_lib_ranges) && !defined(_CCCL_COMPILER_MSVC_2017) +# if defined(__cpp_lib_ranges) && !_CCCL_COMPILER(MSVC2017) # define _CCCL_SPAN_USES_RANGES -# endif // __cpp_lib_ranges && !_CCCL_COMPILER_MSVC_2017 +# endif // __cpp_lib_ranges && !_CCCL_COMPILER(MSVC2017) # if defined(_CCCL_SPAN_USES_RANGES) template @@ -367,7 +367,7 @@ public: } # endif // !_CCCL_SPAN_USES_RANGES -# if _CCCL_COMPILER(NVRTC) || defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_COMPILER(NVRTC) || _CCCL_COMPILER(MSVC2017) template = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr span(type_identity_t (&__arr)[_Sz]) noexcept : __data_{__arr} @@ -376,7 +376,7 @@ public: _LIBCUDACXX_HIDE_FROM_ABI constexpr span(type_identity_t (&__arr)[_Extent]) noexcept : __data_{__arr} {} -# endif // !_CCCL_COMPILER(NVRTC) && !_CCCL_COMPILER_MSVC_2017 +# endif // !_CCCL_COMPILER(NVRTC) && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _OtherElementType) _CCCL_REQUIRES(__span_array_convertible<_OtherElementType, element_type>) @@ -855,7 +855,7 @@ _CCCL_HOST_DEVICE span(const _Container&) -> span= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES template _CCCL_INLINE_VAR constexpr bool enable_borrowed_range> = true; @@ -863,6 +863,6 @@ _CCCL_INLINE_VAR constexpr bool enable_borrowed_range> = true template _CCCL_INLINE_VAR constexpr bool enable_view> = true; 
_LIBCUDACXX_END_NAMESPACE_RANGES -#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX_SPAN diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/variant b/libcudacxx/include/cuda/std/detail/libcxx/include/variant index c40abf1fd23..d71967a3c85 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/variant +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/variant @@ -293,7 +293,7 @@ _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION #endif // !_CCCL_NO_EXCEPTIONS -#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -2152,6 +2152,6 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr auto&& __unchecked_get(variant<_Types...>& _ _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) #endif // _LIBCUDACXX_VARIANT diff --git a/libcudacxx/include/cuda/std/inplace_vector b/libcudacxx/include/cuda/std/inplace_vector index 73449b26188..5137fce4209 100644 --- a/libcudacxx/include/cuda/std/inplace_vector +++ b/libcudacxx/include/cuda/std/inplace_vector @@ -315,11 +315,11 @@ protected: iterator __curr = __dest; for (; __first != __last; ++__curr, (void) ++__first) { -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VRANGES::iter_move(__first)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VSTD::move(*__first)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } this->__size_ += static_cast<__size_type>(__curr - __dest); } @@ -332,11 +332,11 @@ protected: auto __guard = __make_exception_guard(_Rollback_change_size<__inplace_vector_storage>{this, __dest, __curr}); for (; __first != __last; ++__curr, (void) ++__first) { -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VRANGES::iter_move(__first)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv ::new (_CUDA_VSTD::__voidify(*__curr)) _Tp(_CUDA_VSTD::move(*__first)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } __guard.__complete(); this->__size_ += static_cast<__size_type>(__curr - __dest); @@ -590,22 +590,22 @@ protected: _LIBCUDACXX_HIDE_FROM_ABI constexpr void __uninitialized_copy(_Iter __first, _Iter __last, iterator __dest) noexcept { _CUDA_VSTD::copy(__first, __last, __dest); -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) __size_ += static_cast<__size_type>(_CUDA_VRANGES::distance(__first, __last)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv __size_ += static_cast<__size_type>(_CUDA_VSTD::distance(__first, __last)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } template _LIBCUDACXX_HIDE_FROM_ABI constexpr void __uninitialized_move(_Iter __first, _Iter __last, iterator __dest) noexcept { _CUDA_VSTD::copy(__first, __last, __dest); -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) __size_ += 
static_cast<__size_type>(_CUDA_VRANGES::distance(__first, __last)); # else // ^^^ C++17 ^^^ / vvv C++14 vvv __size_ += static_cast<__size_type>(_CUDA_VSTD::distance(__first, __last)); -# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER_MSVC_2017 +# endif // _CCCL_STD_VER <= 2014 || _CCCL_COMPILER(MSVC2017) } }; @@ -670,9 +670,9 @@ struct __inplace_vector_base<_Tp, 0, __inplace_vector_specialization::__empty> _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& unchecked_emplace_back(_Args&&...) noexcept { _CCCL_UNREACHABLE(); -# if defined(_CCCL_COMPILER_MSVC) +# if _CCCL_COMPILER(MSVC) return *begin(); -# endif // _CCCL_COMPILER_MSVC +# endif // _CCCL_COMPILER(MSVC) } protected: @@ -800,7 +800,7 @@ public: } } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -851,7 +851,7 @@ public: this->__uninitialized_move(_CUDA_VRANGES::begin(__range), _CUDA_VRANGES::__unwrap_end(__range), this->begin()); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_HIDE_FROM_ABI constexpr inplace_vector& operator=(initializer_list<_Tp> __ilist) { @@ -961,7 +961,7 @@ public: } } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -1037,7 +1037,7 @@ public: this->__uninitialized_copy(__middle, __last, this->end()); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [containers.sequences.inplace.vector.access], element access _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(const size_type __pos) @@ -1307,7 +1307,7 @@ public: return __res; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -1355,7 +1355,7 @@ public: auto __first = _CUDA_VRANGES::begin(__range); insert(this->end(), __first, _CUDA_VRANGES::__unwrap_end(__range)); } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator emplace(const_iterator __cpos, _Args&&... 
__args) @@ -1451,7 +1451,7 @@ public: return _CUDA_VSTD::addressof(this->unchecked_emplace_back(_CUDA_VSTD::move(__value))); } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES( _CUDA_VRANGES::__container_compatible_range<_Range, _Tp> _CCCL_AND(!_CUDA_VRANGES::forward_range<_Range>)) @@ -1498,7 +1498,7 @@ public: this->__uninitialized_move(__first, __middle, this->end()); return __middle; } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) using __base::unchecked_emplace_back; @@ -1786,7 +1786,7 @@ public: } } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr inplace_vector(from_range_t, _Range&& __range) @@ -1797,7 +1797,7 @@ public: _CUDA_VSTD::__throw_bad_alloc(); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _LIBCUDACXX_HIDE_FROM_ABI constexpr inplace_vector& operator=(initializer_list<_Tp> __ilist) { @@ -1838,7 +1838,7 @@ public: return; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr void assign_range(_Range&& __range) @@ -1849,7 +1849,7 @@ public: } return; } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // [containers.sequences.inplace.vector.access], element access _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr reference at(const size_type __pos) @@ -1996,7 +1996,7 @@ public: return nullptr; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator insert_range(const_iterator __cpos, _Range&& __range) @@ -2017,7 +2017,7 @@ public: _CUDA_VSTD::__throw_bad_alloc(); } } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template _LIBCUDACXX_HIDE_FROM_ABI constexpr iterator emplace(const_iterator, _Args&&...) 
@@ -2061,14 +2061,14 @@ public: return nullptr; } -# if _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) _CCCL_TEMPLATE(class _Range) _CCCL_REQUIRES(_CUDA_VRANGES::__container_compatible_range<_Range, _Tp>) _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VRANGES::iterator_t<_Range> try_append_range(_Range&& __range) noexcept { return _CUDA_VRANGES::begin(__range); } -# endif // _CCCL_STD_VER >= 2017 && !defined(_CCCL_COMPILER_MSVC_2017) +# endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) using __base::unchecked_emplace_back; diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 841aa449c77..238259f45ef 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -48,9 +48,9 @@ # define __cccl_lib_is_null_pointer 201309L # define __cccl_lib_make_reverse_iterator 201402L // # define __cccl_lib_make_unique 201304L -# if !defined(_CCCL_COMPILER_MSVC) || _CCCL_STD_VER >= 2020 +# if !_CCCL_COMPILER(MSVC) || _CCCL_STD_VER >= 2020 # define __cccl_lib_mdspan 202207L -# endif // _CCCL_COMPILER_MSVC && _CCCL_STD_VER >= 2020 +# endif // _CCCL_COMPILER(MSVC) && _CCCL_STD_VER >= 2020 # define __cccl_lib_null_iterators 201304L # define __cccl_lib_optional 202110L // # define __cccl_lib_quoted_string_io 201304L diff --git a/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp b/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp index 976997b99b2..2fc33268dfc 100644 --- a/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/language.support/support.srcloc/general.pass.cpp @@ -68,7 +68,7 @@ ASSERT_NOEXCEPT(device_empty.function_name()); constexpr cuda::std::source_location cur = cuda::std::source_location::current(); static_assert(cur.line() == 1000, ""); -#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927 +#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _MSC_VER >= 1927 static_assert(cur.column() > 0, ""); #else // ^^^ _CCCL_BULTIN_COLUMN ^^^ / vvv !_CCCL_BULTIN_COLUMN vvv static_assert(cur.column() == 0, ""); @@ -78,7 +78,7 @@ static_assert(cur.file_name()[0] == __FILE__[0] && cur.file_name()[1] == __FILE_ ""); // MSVC below 19.27 is broken with function name -#if !defined(_CCCL_COMPILER_MSVC) || _CCCL_MSVC_VERSION >= 1927 +#if !_CCCL_COMPILER(MSVC) || _MSC_VER >= 1927 static_assert(cur.function_name()[0] == '\0', ""); #else // ^^^ __builtin_FUNCTION ^^^ / vvv !__builtin_FUNCTION vvv static_assert(compare_strings(cur.function_name(), "__builtin_FUNCTION is unsupported")); @@ -139,14 +139,14 @@ __host__ __device__ void test() assert(compare_strings(local.file_name(), __FILE__)); // MSVC below 19.27 is broken with function name -#if !defined(_CCCL_COMPILER_MSVC) || _CCCL_MSVC_VERSION >= 1927 +#if !_CCCL_COMPILER(MSVC) || _MSC_VER >= 1927 assert(find_substring(local.function_name(), "test")); #else // ^^^ __builtin_FUNCTION ^^^ / vvv !__builtin_FUNCTION vvv assert(compare_strings(local.function_name(), "__builtin_FUNCTION is unsupported")); #endif // !__builtin_FUNCTION assert(local.line() == 2000); -#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1927 +#if _CCCL_HAS_BUILTIN(__builtin_COLUMN) || defined(TEST_COMPILER_MSVC) && _MSC_VER >= 1927 assert(cur.column() > 0); #else // ^^^ _CCCL_BULTIN_COLUMN ^^^ / vvv 
!_CCCL_BULTIN_COLUMN vvv assert(cur.column() == 0); diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp index db3e3877dc0..25a0d2aff05 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp @@ -67,7 +67,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() int main(int, char**) { test(); -#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !defined(_CCCL_COMPILER_MSVC)) +#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !_CCCL_COMPILER(MSVC)) static_assert(test(), ""); #endif // TEST_STD_VER >= 2014 diff --git a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp index 2c22b439ffb..61953443dfd 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/template.bitset/bitset.members/to_ulong.pass.cpp @@ -66,7 +66,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() int main(int, char**) { test(); -#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !defined(_CCCL_COMPILER_MSVC)) +#if TEST_STD_VER >= 2014 && (_CCCL_CUDACC_AT_LEAST(11, 8) || !_CCCL_COMPILER(MSVC)) static_assert(test(), ""); #endif // TEST_STD_VER >= 2014 diff --git a/libcudacxx/test/support/test_macros.h b/libcudacxx/test/support/test_macros.h index c42adf2d0bb..c81987a0dc1 100644 --- a/libcudacxx/test/support/test_macros.h +++ b/libcudacxx/test/support/test_macros.h @@ -148,7 +148,7 @@ #endif #if TEST_HAS_BUILTIN(__builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ - || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) + || (_CCCL_COMPILER(MSVC) && _MSC_VER > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) # define TEST_IS_CONSTANT_EVALUATED() _CUDA_VSTD::__libcpp_is_constant_evaluated() #else # define TEST_IS_CONSTANT_EVALUATED() false @@ -245,8 +245,8 @@ #endif #ifndef TEST_HAS_NO_EXCEPTIONS -# if (defined(_CCCL_COMPILER_MSVC) && _HAS_EXCEPTIONS == 0) \ - || (!defined(_CCCL_COMPILER_MSVC) && !__EXCEPTIONS) // Catches all non msvc based compilers +# if (_CCCL_COMPILER(MSVC) && _HAS_EXCEPTIONS == 0) || (!_CCCL_COMPILER(MSVC) && !__EXCEPTIONS) // Catches all non msvc + // based compilers # define TEST_HAS_NO_EXCEPTIONS # endif #endif // !TEST_HAS_NO_EXCEPTIONS diff --git a/thrust/testing/async_sort.cu b/thrust/testing/async_sort.cu index feb5cb5624a..77144779814 100644 --- a/thrust/testing/async_sort.cu +++ b/thrust/testing/async_sort.cu @@ -1,7 +1,7 @@ #include // Disabled on MSVC && NVCC < 11.1 for GH issue #1098. 
-#if defined(_CCCL_COMPILER_MSVC) && defined(__CUDACC__) +#if _CCCL_COMPILER(MSVC) && defined(__CUDACC__) # if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1) # define THRUST_BUG_1098_ACTIVE # endif // NVCC version check diff --git a/thrust/testing/cuda/transform.cu b/thrust/testing/cuda/transform.cu index 888264ffce2..2e474ccfb5a 100644 --- a/thrust/testing/cuda/transform.cu +++ b/thrust/testing/cuda/transform.cu @@ -355,7 +355,7 @@ struct sum_five }; // The following test cannot be compiled because of a bug in the conversion of thrust::tuple on MSVC 2017 -#ifndef _CCCL_COMPILER_MSVC_2017 +#if !_CCCL_COMPILER(MSVC2017) // we specialize zip_function for sum_five, but do nothing in the call operator so the test below would fail if the // zip_function is actually called (and not unwrapped) THRUST_NAMESPACE_BEGIN @@ -420,4 +420,4 @@ void TestTransformZipIteratorUnwrapping() } } DECLARE_UNITTEST(TestTransformZipIteratorUnwrapping); -#endif // !_CCCL_COMPILER_MSVC_2017 +#endif // !_CCCL_COMPILER(MSVC2017) diff --git a/thrust/testing/functional.cu b/thrust/testing/functional.cu index 20478dbcb9b..7757ed47bed 100644 --- a/thrust/testing/functional.cu +++ b/thrust/testing/functional.cu @@ -212,7 +212,7 @@ THRUST_DISABLE_BROKEN_GCC_VECTORIZER void TestIdentityFunctional() // value categories when casting to different type static_assert(::cuda::std::is_same{}(3.14)), int&&>::value, ""); // unfortunately, old versions of MSVC pick the `const int&` overload instead of `int&&` -#if defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1929 +#if _CCCL_COMPILER(MSVC, >=, 19, 29) static_assert(::cuda::std::is_same{}(d)), int&&>::value, ""); static_assert(::cuda::std::is_same{}(as_const(d))), int&&>::value, ""); #endif diff --git a/thrust/testing/set_difference.cu b/thrust/testing/set_difference.cu index cdb538d384c..5fe1de1fc1e 100644 --- a/thrust/testing/set_difference.cu +++ b/thrust/testing/set_difference.cu @@ -172,7 +172,7 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceMultiset); // FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration. // That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes. -#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void TestSetDifferenceWithBigIndexesHelper(int magnitude) { thrust::counting_iterator begin(0); diff --git a/thrust/testing/set_intersection.cu b/thrust/testing/set_intersection.cu index 392e23b7337..af95e2cdf07 100644 --- a/thrust/testing/set_intersection.cu +++ b/thrust/testing/set_intersection.cu @@ -206,7 +206,7 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset); // FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration. // That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes. 
-#if !defined(_CCCL_COMPILER_MSVC) +#if !_CCCL_COMPILER(MSVC) void TestSetDifferenceWithBigIndexesHelper(int magnitude) { thrust::counting_iterator begin1(0); diff --git a/thrust/testing/vector_manipulation.cu b/thrust/testing/vector_manipulation.cu index e5492eeb1a0..3a10492319c 100644 --- a/thrust/testing/vector_manipulation.cu +++ b/thrust/testing/vector_manipulation.cu @@ -20,7 +20,7 @@ void TestVectorManipulation(size_t n) ASSERT_EQUAL(test1.size(), n); ASSERT_EQUAL((test1 == std::vector(n, T(3))), true); -#if defined(_CCCL_COMPILER_MSVC) && (_MSC_VER <= 1400) +#if _CCCL_COMPILER(MSVC, <=, 14) // XXX MSVC 2005's STL unintentionally uses adl to dispatch advance which // produces an ambiguity between std::advance & thrust::advance // don't produce a KNOWN_FAILURE, just ignore the issue diff --git a/thrust/thrust/detail/config/compiler.h b/thrust/thrust/detail/config/compiler.h index 6b73b04f55e..25d8ebfb29e 100644 --- a/thrust/thrust/detail/config/compiler.h +++ b/thrust/thrust/detail/config/compiler.h @@ -55,13 +55,13 @@ #define THRUST_DEVICE_COMPILER_NVCC 4 // figure out which host compiler we're using -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC //! deprecated [Since 2.7] -# define THRUST_MSVC_VERSION _CCCL_MSVC_VERSION +# define THRUST_MSVC_VERSION _MSC_VER //! deprecated [Since 2.7] -# define THRUST_MSVC_VERSION_FULL _CCCL_MSVC_VERSION_FULL +# define THRUST_MSVC_VERSION_FULL _MSC_FULL_VER #elif _CCCL_COMPILER(ICC) //! deprecated [Since 2.7] # define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_INTEL @@ -91,7 +91,7 @@ #if defined(__CUDACC__) || defined(_NVHPC_CUDA) //! deprecated [Since 2.7] # define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) //! deprecated [Since 2.7] # define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC #elif _CCCL_COMPILER(GCC) diff --git a/thrust/thrust/detail/config/compiler_fence.h b/thrust/thrust/detail/config/compiler_fence.h index fc8cda95682..4b93b682c99 100644 --- a/thrust/thrust/detail/config/compiler_fence.h +++ b/thrust/thrust/detail/config/compiler_fence.h @@ -28,7 +28,7 @@ #include -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # pragma message( \ "warning: The functionality in this header is unsafe, deprecated, and will soon be removed. Use C++11 atomics instead.") #else @@ -36,7 +36,7 @@ #endif // msvc case -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # ifndef _DEBUG diff --git a/thrust/thrust/detail/config/cpp_dialect.h b/thrust/thrust/detail/config/cpp_dialect.h index e7589b39638..87733b15d55 100644 --- a/thrust/thrust/detail/config/cpp_dialect.h +++ b/thrust/thrust/detail/config/cpp_dialect.h @@ -68,7 +68,7 @@ #define THRUST_CPP_DIALECT _CCCL_STD_VER // Define THRUST_COMPILER_DEPRECATION macro: -#if defined(_CCCL_COMPILER_MSVC) +#if _CCCL_COMPILER(MSVC) # define THRUST_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(message(__FILE__ ":" _CCCL_TO_STRING(__LINE__) ": warning: " #msg)) #else // clang / gcc: # define THRUST_COMP_DEPR_IMPL(msg) _CCCL_PRAGMA(GCC warning #msg) @@ -89,10 +89,10 @@ THRUST_COMPILER_DEPRECATION(GCC 5.0); # elif _CCCL_COMPILER(CLANG, <, 7) THRUST_COMPILER_DEPRECATION(Clang 7.0); -# elif defined(_CCCL_COMPILER_MSVC) && THRUST_MSVC_VERSION < 1910 +# elif _CCCL_COMPILER(MSVC, <, 19, 10) // <2017. 
Hard upgrade message: THRUST_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); -# elif defined(_CCCL_COMPILER_MSVC) && THRUST_MSVC_VERSION < 1920 +# elif _CCCL_COMPILER(MSVC2017) // >=2017, <2019. Soft deprecation message: THRUST_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif diff --git a/thrust/thrust/detail/config/deprecated.h b/thrust/thrust/detail/config/deprecated.h index af53047212d..29204f49287 100644 --- a/thrust/thrust/detail/config/deprecated.h +++ b/thrust/thrust/detail/config/deprecated.h @@ -43,7 +43,7 @@ #elif _CCCL_STD_VER >= 2014 # define THRUST_DEPRECATED [[deprecated]] # define THRUST_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] -#elif defined(_CCCL_COMPILER_MSVC) +#elif _CCCL_COMPILER(MSVC) # define THRUST_DEPRECATED __declspec(deprecated) # define THRUST_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) #elif _CCCL_COMPILER(CLANG) diff --git a/thrust/thrust/iterator/permutation_iterator.h b/thrust/thrust/iterator/permutation_iterator.h index 821a0a2484d..38dd35456b8 100644 --- a/thrust/thrust/iterator/permutation_iterator.h +++ b/thrust/thrust/iterator/permutation_iterator.h @@ -170,10 +170,10 @@ class permutation_iterator : public thrust::detail::permutation_iterator_basebase()); } -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) _CCCL_DIAG_POP -#endif // _CCCL_COMPILER_MSVC_2017 +#endif // _CCCL_COMPILER(MSVC2017) // make friends for the copy constructor template diff --git a/thrust/thrust/iterator/reverse_iterator.h b/thrust/thrust/iterator/reverse_iterator.h index 21d258e6d31..a3e6b737b68 100644 --- a/thrust/thrust/iterator/reverse_iterator.h +++ b/thrust/thrust/iterator/reverse_iterator.h @@ -163,11 +163,11 @@ class reverse_iterator : public detail::reverse_iterator_base::type /*! Default constructor does nothing. */ -#if defined(_CCCL_COMPILER_MSVC_2017) +#if _CCCL_COMPILER(MSVC2017) inline _CCCL_HOST_DEVICE zip_iterator() {} -#else // ^^^ _CCCL_COMPILER_MSVC_2017 ^^^ / vvv !_CCCL_COMPILER_MSVC_2017 vvv +#else // ^^^ _CCCL_COMPILER(MSVC2017) ^^^ / vvv !_CCCL_COMPILER(MSVC2017) vvv zip_iterator() = default; -#endif // !_CCCL_COMPILER_MSVC_2017 +#endif // !_CCCL_COMPILER(MSVC2017) /*! This constructor creates a new \p zip_iterator from a * \p tuple of iterators. 
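For reference, the _CCCL_COMPILER(...) detection macro adopted throughout this patch expands to 0 or 1, so it composes directly in #if expressions with !, && and ||. Three forms appear in the hunks above: the bare compiler name, the dedicated MSVC2017 token, and a comparison against a split major/minor version (the old _MSC_VER < 1938 check becomes _CCCL_COMPILER(MSVC, <, 19, 38)). The sketch below only restates those usage forms; the EXAMPLE_* macro names are hypothetical, and the macro's real definition lives in CCCL's compiler-detection headers rather than in this patch.

// Hedged usage sketch, assuming a CCCL configuration header defining _CCCL_COMPILER
// has already been included; EXAMPLE_* are placeholder names for illustration only.
#if _CCCL_COMPILER(MSVC) // any MSVC host compiler
#  define EXAMPLE_ON_MSVC 1
#endif
#if _CCCL_COMPILER(MSVC, <, 19, 38) // old spelling: _CCCL_MSVC_VERSION < 1938
#  define EXAMPLE_ON_OLD_MSVC 1
#endif
#if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) // the recurring ranges guard in these hunks
#  define EXAMPLE_RANGES_ENABLED 1
#endif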
diff --git a/thrust/thrust/optional.h b/thrust/thrust/optional.h
index bb9bf1cfb4b..dbee5ebda24 100644
--- a/thrust/thrust/optional.h
+++ b/thrust/thrust/optional.h
@@ -37,7 +37,7 @@
 #include
 #include
-#if defined(_CCCL_COMPILER_MSVC) && _MSC_VER == 1900
+#if _CCCL_COMPILER(MSVC, ==, 19, 00)
 # define THRUST_OPTIONAL_MSVC2015
 #endif
@@ -231,7 +231,7 @@ using enable_assign_from_other = detail::enable_if_t<
 && !std::is_assignable&>::value && !std::is_assignable&&>::value
 && !std::is_assignable&>::value && !std::is_assignable&&>::value>;
-#if defined(_CCCL_COMPILER_MSVC)
+#if _CCCL_COMPILER(MSVC)
 // TODO make a version which works with MSVC
 template
 struct is_swappable : std::true_type
diff --git a/thrust/thrust/system/detail/error_code.inl b/thrust/thrust/system/detail/error_code.inl
index e27c7db0286..0f41a9da220 100644
--- a/thrust/thrust/system/detail/error_code.inl
+++ b/thrust/thrust/system/detail/error_code.inl
@@ -50,10 +50,10 @@ error_code ::error_code(int val, const error_category& cat)
 template
 error_code ::error_code(ErrorCodeEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>*
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 )
 {
 *this = make_error_code(e);
@@ -67,11 +67,11 @@ void error_code ::assign(int val, const error_category& cat)
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_code>&
 #else
 error_code&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 error_code ::operator=(ErrorCodeEnum e)
 {
 *this = make_error_code(e);
diff --git a/thrust/thrust/system/detail/error_condition.inl b/thrust/thrust/system/detail/error_condition.inl
index a63323be760..f9ad1f2b696 100644
--- a/thrust/thrust/system/detail/error_condition.inl
+++ b/thrust/thrust/system/detail/error_condition.inl
@@ -51,10 +51,10 @@ error_condition ::error_condition(int val, const error_category& cat)
 template
 error_condition ::error_condition(ErrorConditionEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>*
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 )
 {
 *this = make_error_condition(e);
@@ -68,11 +68,11 @@ void error_condition ::assign(int val, const error_category& cat)
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_condition>&
 #else
 error_condition&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 error_condition ::operator=(ErrorConditionEnum e)
 {
 *this = make_error_condition(e);
diff --git a/thrust/thrust/system/error_code.h b/thrust/thrust/system/error_code.h
index d5313db8b2b..1573f8a3a82 100644
--- a/thrust/thrust/system/error_code.h
+++ b/thrust/thrust/system/error_code.h
@@ -256,10 +256,10 @@ class error_code
 template
 error_code(ErrorCodeEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>* = 0
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 );
 // [19.5.2.3] modifiers:
@@ -272,11 +272,11 @@ class error_code
 */
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_code>&
 #else
 error_code&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 operator=(ErrorCodeEnum e);
 /*! \post value() == 0 and category() == system_category().
@@ -367,10 +367,10 @@ class error_condition
 template
 error_condition(ErrorConditionEnum e
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ,
 ::cuda::std::enable_if_t::value>* = 0
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 );
 // [19.5.3.3] modifiers
@@ -391,11 +391,11 @@ class error_condition
 */
 template
 // XXX WAR msvc's problem with enable_if
-#if !defined(_CCCL_COMPILER_MSVC)
+#if !_CCCL_COMPILER(MSVC)
 ::cuda::std::enable_if_t::value, error_condition>&
 #else
 error_condition&
-#endif // !_CCCL_COMPILER_MSVC
+#endif // !_CCCL_COMPILER(MSVC)
 operator=(ErrorConditionEnum e);
 /*! Clears this \p error_code object.
diff --git a/thrust/thrust/type_traits/is_contiguous_iterator.h b/thrust/thrust/type_traits/is_contiguous_iterator.h
index 26ef2020e6b..303b54f38a4 100644
--- a/thrust/thrust/type_traits/is_contiguous_iterator.h
+++ b/thrust/thrust/type_traits/is_contiguous_iterator.h
@@ -39,7 +39,7 @@
 #include
 #include
-#if defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1916 // MSVC 2017 version 15.9
+#if _CCCL_COMPILER(MSVC, <, 19, 16) // MSVC 2017 version 15.9
 # include
 # include
 # include

From cee542b88fb4e943f8ee04c0b1e200ae5fb4bd3b Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Fri, 22 Nov 2024 11:28:08 +0100
Subject: [PATCH 07/45] Reorganize PTX tests to match generator (#2930)

---
 .../cuda/ptx/generated/barrier_cluster.inc | 40 ++
 .../cuda/ptx/generated/cp_async_bulk.inc | 37 ++
 .../generated/cp_async_bulk_commit_group.inc | 10 +
 .../ptx/generated/cp_async_bulk_multicast.inc | 18 +
 .../ptx/generated/cp_async_bulk_tensor.inc | 117 ++++
 .../cp_async_bulk_tensor_multicast.inc | 82 +++
 .../generated/cp_async_bulk_wait_group.inc | 18 +
 .../ptx/generated/cp_reduce_async_bulk.inc | 476 +++++++++++++++
 .../generated/cp_reduce_async_bulk_bf16.inc | 44 ++
 .../generated/cp_reduce_async_bulk_f16.inc | 35 ++
 .../generated/cp_reduce_async_bulk_tensor.inc | 392 ++++++++++++
 .../libcudacxx/cuda/ptx/generated/fence.inc | 38 ++
 .../ptx/generated/fence_mbarrier_init.inc | 11 +
 .../cuda/ptx/generated/fence_proxy_alias.inc | 9 +
 .../cuda/ptx/generated/fence_proxy_async.inc | 24 +
 .../fence_proxy_tensormap_generic.inc | 44 ++
 .../cuda/ptx/generated/get_sreg.inc | 331 +++++++++++
 .../cuda/ptx/generated/getctarank.inc | 10 +
 .../cuda/ptx/generated/mbarrier_arrive.inc | 74 +++
 .../generated/mbarrier_arrive_expect_tx.inc | 31 +
 .../generated/mbarrier_arrive_no_complete.inc | 10 +
 .../cuda/ptx/generated/mbarrier_init.inc | 10 +
 .../cuda/ptx/generated/mbarrier_try_wait.inc | 53 ++
 .../generated/mbarrier_try_wait_parity.inc | 52 ++
 .../cuda/ptx/generated/mbarrier_wait.inc | 24 +
 .../ptx/generated/mbarrier_wait_parity.inc | 24 +
 .../cuda/ptx/generated/red_async.inc | 120 ++++
 .../cuda/ptx/generated/st_async.inc | 35 ++
 .../ptx/generated/tensormap_cp_fenceproxy.inc | 29 +
 .../cuda/ptx/generated/tensormap_replace.inc | 198 +++++++
 .../ptx/ptx.barrier.cluster.compile.pass.cpp | 42 +-
 ...p.async.bulk.commit_group.compile.pass.cpp | 11 +-
 .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 38 +-
 ...x.cp.async.bulk.multicast.compile.pass.cpp | 19 +-
 .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 118 +---
 ...ync.bulk.tensor.multicast.compile.pass.cpp | 83 +--
 ....cp.async.bulk.wait_group.compile.pass.cpp | 19 +-
 .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 560 +-----------------
 ....reduce.async.bulk.tensor.compile.pass.cpp | 393 +-----
.../cuda/ptx/ptx.fence.compile.pass.cpp | 135 +---- .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 332 +---------- .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 11 +- .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 120 +--- .../ptx/ptx.mbarrier.init.compile.pass.cpp | 11 +- .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 160 +---- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 121 +--- .../cuda/ptx/ptx.st.async.compile.pass.cpp | 36 +- ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 30 +- .../ptx.tensormap.replace.compile.pass.cpp | 199 +------ 49 files changed, 2427 insertions(+), 2407 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc new file mode 100644 index 00000000000..cad5510ba70 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc @@ 
-0,0 +1,40 @@ +__global__ void test_barrier_cluster(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.release; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.relaxed; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.acquire; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc new file mode 100644 index 00000000000..cd66de989a2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc @@ -0,0 +1,37 @@ +__global__ void test_cp_async_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // + // 1a. unicast + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, + // [rdsmem_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc new file mode 100644 index 00000000000..afdf14abb8a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc @@ -0,0 +1,10 @@ +__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.commit_group; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc new file mode 100644 index 00000000000..b2bd0d968d9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc @@ -0,0 +1,18 @@ +__global__ void test_cp_async_bulk_multicast(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc new file mode 100644 index 00000000000..f9d0d240d28 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc @@ -0,0 +1,117 @@ +__global__ void test_cp_async_bulk_tensor(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1c. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1d. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar];// 1e. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..2851aab6d7c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,82 @@ +__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..0139a65f6ce --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,18 @@ +__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // cp.async.bulk.wait_group N; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::cp_async_bulk_wait_group));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // cp.async.bulk.wait_group.read N; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::cp_async_bulk_wait_group_read));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..5ee274bcbe8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,476 @@ +__global__ void test_cp_reduce_async_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], + // size, [rdsmem_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [dstMem], [srcMem], size; // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk)); + // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [dstMem], [srcMem], size; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [dstMem], [srcMem], size; // 4. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 6. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..fe38374fe00 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,44 @@ +__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) +{ +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [dstMem], [srcMem], size; // 5. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..e7e58cfcb80 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,35 @@ +__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) +{ +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [dstMem], [srcMem], size; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 + +# if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_reduce_async_bulk));)); +# endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..6f0a7d710ce --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,392 @@ +__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1a. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1c. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1c. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1d. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // + // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; + // // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); + // cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], + // [srcMem]; // 1e. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc new file mode 100644 index 00000000000..2e464580de9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc @@ -0,0 +1,38 @@ +__global__ void test_fence(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 600 + NV_IF_TARGET( + NV_PROVIDES_SM_70, + ( + // fence.sc.cta; // 1. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence)); + // fence.sc.gpu; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.sc.sys; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.cta; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.gpu; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.sys; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 600 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.sc.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acq_rel.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..f503c1d055b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc @@ -0,0 +1,11 @@ +__global__ void test_fence_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.mbarrier_init.release.cluster; // 3. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_mbarrier_init));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..a8021d3f5be --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc @@ -0,0 +1,9 @@ +__global__ void test_fence_proxy_alias(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 750 + NV_IF_TARGET(NV_PROVIDES_SM_70, + ( + // fence.proxy.alias; // 4. + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); +#endif // __cccl_ptx_isa >= 750 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..e3d8e6d160a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc @@ -0,0 +1,24 @@ +__global__ void test_fence_proxy_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.proxy.async; // 5. 
+ * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async.global; // 6. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); + // fence.proxy.async.shared::cluster; // 6. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); + // fence.proxy.async.shared::cta; // 6. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence_proxy_async));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc new file mode 100644 index 00000000000..1e0ea93a387 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc @@ -0,0 +1,44 @@ +__global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.tensormap::generic.release.cta; // 7. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.release.cluster; // 7. + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.release.gpu; // 7. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.release.sys; // 7. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_proxy_tensormap_generic));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.tensormap::generic.acquire.cta [addr], size; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.acquire.cluster [addr], size; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.acquire.gpu [addr], size; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic)); + // fence.proxy.tensormap::generic.acquire.sys [addr], size; // 8. 
+ * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::fence_proxy_tensormap_generic));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc new file mode 100644 index 00000000000..90842352f90 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc @@ -0,0 +1,331 @@ +__global__ void test_get_sreg(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%tid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ntid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%laneid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_laneid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%warpid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_warpid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%nwarpid; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nwarpid));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%ctaid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.x; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_x)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.y; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_y)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + // mov.u32 sreg_value, %%nctaid.z; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_z)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 130 + // mov.u32 sreg_value, %%smid; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_smid)); +#endif // __cccl_ptx_isa >= 130 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%nsmid; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nsmid));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 300 + // mov.u64 sreg_value, %%gridid; + *fn_ptr++ = 
reinterpret_cast(static_cast(cuda::ptx::get_sreg_gridid)); +#endif // __cccl_ptx_isa >= 300 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.pred sreg_value, %%is_explicit_cluster; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_is_explicit_cluster));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%clusterid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%nclusterid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctaid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.x; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_x));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.y; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_y));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctaid.z; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_z));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_ctarank; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctarank));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%cluster_nctarank; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctarank));)); +#endif // __cccl_ptx_isa 
>= 780 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_eq; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_eq));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_le; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_le));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_lt; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_lt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_ge; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_ge));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%lanemask_gt; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_gt));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 100 + // mov.u32 sreg_value, %%clock; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock)); +#endif // __cccl_ptx_isa >= 100 + +#if __cccl_ptx_isa >= 500 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%clock_hi; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock_hi));)); +#endif // __cccl_ptx_isa >= 500 + +#if __cccl_ptx_isa >= 200 + NV_IF_TARGET(NV_PROVIDES_SM_35, + ( + // mov.u64 sreg_value, %%clock64; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock64));)); +#endif // __cccl_ptx_isa >= 200 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u64 sreg_value, %%globaltimer; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%globaltimer_lo; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_lo));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 310 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%globaltimer_hi; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_hi));)); +#endif // __cccl_ptx_isa >= 310 + +#if __cccl_ptx_isa >= 410 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%total_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_total_smem_size));)); +#endif // __cccl_ptx_isa >= 410 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mov.u32 sreg_value, %%aggr_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_aggr_smem_size));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 410 + NV_IF_TARGET( + NV_PROVIDES_SM_35, + ( + // mov.u32 sreg_value, %%dynamic_smem_size; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_dynamic_smem_size));)); +#endif // __cccl_ptx_isa >= 410 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_50, + ( + // mov.u64 sreg_value, %%current_graph_exec; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_current_graph_exec));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc new file mode 100644 index 00000000000..28b04c9f738 --- 
/dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc @@ -0,0 +1,10 @@ +__global__ void test_getctarank(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // getctarank.shared::cluster.u32 dest, addr; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::getctarank));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..4a94ec51d45 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc @@ -0,0 +1,74 @@ +__global__ void test_mbarrier_arrive(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.shared.b64 state, [addr]; // 1. + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr]; // 3a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr], count; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 3b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr]; // 4a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr], count; // 4b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..085723a452b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,31 @@ +__global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 state, [addr], tx_count; // 8. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx)); + // mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 state, [addr], tx_count; // 8. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [addr], tx_count; // 9. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..d1d017cd3c2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,10 @@ +__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc new file mode 100644 index 00000000000..f814161d1f9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc @@ -0,0 +1,10 @@ +__global__ void test_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.init.shared.b64 [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_init));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..e9d8661a07e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc @@ -0,0 +1,53 @@ +__global__ void test_mbarrier_try_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // + // 5b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // + // 6a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 6a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // + // 6b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; + // // 6b. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..f8c3875451a --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,52 @@ +__global__ void test_mbarrier_try_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // + // 8a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // + // 8b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, + // suspendTimeHint; // 8b. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc new file mode 100644 index 00000000000..80129e5016c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc @@ -0,0 +1,24 @@ +__global__ void test_mbarrier_test_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc new file mode 100644 index 00000000000..30902c58905 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc @@ -0,0 +1,24 @@ +__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc new file mode 100644 index 00000000000..0d562fd31a7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc @@ -0,0 +1,120 @@ +__global__ void test_red_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + // // .u64 intentional + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::red_async));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc new file mode 100644 index 00000000000..4efb95ef217 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc @@ -0,0 +1,35 @@ +__global__ void test_st_async(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar]; // 1. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::st_async)); + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [addr], value, [remote_bar]; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [addr], value, [remote_bar]; // 2. + * fn_ptr++ = + reinterpret_cast(static_cast(cuda::ptx::st_async)); + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [addr], value, [remote_bar]; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; + // // 3. 
+ * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::st_async));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..9a0a8c1f615 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,29 @@ +__global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], + // size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy)); + // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size; + * fn_ptr++ = reinterpret_cast( + static_cast)>( + cuda::ptx::tensormap_cp_fenceproxy));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc new file mode 100644 index 00000000000..c69f3d11964 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc @@ -0,0 +1,198 @@ +__global__ void test_tensormap_replace(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // 
tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // 
tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); +#endif // __cccl_ptx_isa >= 830 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index e6088d2f317..c460a2e5b09 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -31,48 +31,10 @@ * */ -__global__ void test_barrier_cluster(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.release; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.arrive.relaxed; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // barrier.cluster.wait.acquire; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::barrier_cluster_wait));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/barrier_cluster.inc" int main(int, char**) { + // FIXME(bgruber): why no call to test_barrier_cluster? 
return 0; } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index b4dff69d5b7..4695221dbc5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.commit_group; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_commit_group.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index b234c35fcdc..b1811727b66 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -31,43 +31,7 @@ * */ -__global__ void test_cp_async_bulk(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // - // 1a. unicast - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, - // [rdsmem_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index 8dbc81741d2..c040528cabc 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -33,24 +33,7 @@ * */ -__global__ void test_cp_async_bulk_multicast(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], - // size, [smem_bar], ctaMask; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_multicast.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 64d9b9590a3..0b69b8a8f1c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -31,123 +31,7 @@ * */ -__global__ void test_cp_async_bulk_tensor(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1c. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1d. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1e. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_tensor.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 2a3457396d0..7d53d9ee0c9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -33,88 +33,7 @@ * */ -__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_tensor_multicast.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 3bfa9bbc7dd..39df53c5f9d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -31,24 +31,7 @@ * */ -__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // cp.async.bulk.wait_group N; - * fn_ptr++ = reinterpret_cast( - static_cast)>(cuda::ptx::cp_async_bulk_wait_group));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // cp.async.bulk.wait_group.read N; - * fn_ptr++ = reinterpret_cast( - static_cast)>(cuda::ptx::cp_async_bulk_wait_group_read));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_async_bulk_wait_group.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index b1d06ca49c0..a186e34a809 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -31,568 +31,14 @@ * */ -__global__ void test_cp_reduce_async_bulk(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [dstMem], [srcMem], - // size, [rdsmem_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [dstMem], [srcMem], size; // 3. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk)); - // cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [dstMem], [srcMem], size; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 4. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [dstMem], [srcMem], size; // 6. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_reduce_async_bulk.inc" #ifdef _LIBCUDACXX_HAS_NVF16 -__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 -} - +# include "generated/cp_reduce_async_bulk_f16.inc" #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) -{ -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [dstMem], [srcMem], size; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 - -# if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [dstMem], [srcMem], size; // 5. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 -} - +# include "generated/cp_reduce_async_bulk_bf16.inc" #endif // _LIBCUDACXX_HAS_NVBF16 int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index 5ae7d313c36..14abc0d3ae6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -31,398 +31,7 @@ * */ -__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1c. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1d. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // - // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; - // // 1e. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor)); - // cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [tensorMap, tensorCoords], - // [srcMem]; // 1e. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::cp_reduce_async_bulk_tensor));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/cp_reduce_async_bulk_tensor.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 0be4f6b32fe..641cb83f172 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -31,136 +31,11 @@ * */ -__global__ void test_fence(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 600 - NV_IF_TARGET( - NV_PROVIDES_SM_70, - ( - // fence.sc.cta; // 1. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence)); - // fence.sc.gpu; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.sc.sys; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cta; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.gpu; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.sys; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 600 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.sc.cluster; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cluster; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 780 -} - -__global__ void test_fence_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.mbarrier_init.release.cluster; // 3. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_mbarrier_init));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_fence_proxy_alias(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 750 - NV_IF_TARGET(NV_PROVIDES_SM_70, - ( - // fence.proxy.alias; // 4. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); -#endif // __cccl_ptx_isa >= 750 -} - -__global__ void test_fence_proxy_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // fence.proxy.async; // 5. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.async.global; // 6. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); - // fence.proxy.async.shared::cluster; // 6. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::fence_proxy_async)); - // fence.proxy.async.shared::cta; // 6. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence_proxy_async));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.tensormap::generic.release.cta; // 7. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.cluster; // 7. - * fn_ptr++ = - reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.gpu; // 7. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.release.sys; // 7. 
- * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_proxy_tensormap_generic));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.proxy.tensormap::generic.acquire.cta [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.cluster [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.gpu [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic)); - // fence.proxy.tensormap::generic.acquire.sys [addr], size; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::fence_proxy_tensormap_generic));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/fence.inc" +#include "generated/fence_mbarrier_init.inc" +#include "generated/fence_proxy_alias.inc" +#include "generated/fence_proxy_async.inc" +#include "generated/fence_proxy_tensormap_generic.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 0003afb2fe2..697cc00a1be 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -32,337 +32,7 @@ * */ -__global__ void test_get_sreg(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%tid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_tid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ntid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ntid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%laneid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_laneid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%warpid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_warpid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%nwarpid; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nwarpid));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%ctaid.z; - *fn_ptr++ = 
reinterpret_cast(static_cast(cuda::ptx::get_sreg_ctaid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.x; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_x)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.y; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_y)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - // mov.u32 sreg_value, %%nctaid.z; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nctaid_z)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 130 - // mov.u32 sreg_value, %%smid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_smid)); -#endif // __cccl_ptx_isa >= 130 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%nsmid; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nsmid));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 300 - // mov.u64 sreg_value, %%gridid; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_gridid)); -#endif // __cccl_ptx_isa >= 300 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.pred sreg_value, %%is_explicit_cluster; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_is_explicit_cluster));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%clusterid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clusterid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%nclusterid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_nclusterid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctaid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctaid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, 
%%cluster_nctaid.x; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_x));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctaid.y; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_y));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctaid.z; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctaid_z));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_ctarank; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_ctarank));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%cluster_nctarank; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_cluster_nctarank));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_eq; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_eq));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_le; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_le));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_lt; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_lt));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_ge; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_ge));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%lanemask_gt; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_lanemask_gt));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 100 - // mov.u32 sreg_value, %%clock; - *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock)); -#endif // __cccl_ptx_isa >= 100 - -#if __cccl_ptx_isa >= 500 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%clock_hi; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock_hi));)); -#endif // __cccl_ptx_isa >= 500 - -#if __cccl_ptx_isa >= 200 - NV_IF_TARGET(NV_PROVIDES_SM_35, - ( - // mov.u64 sreg_value, %%clock64; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_clock64));)); -#endif // __cccl_ptx_isa >= 200 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u64 sreg_value, %%globaltimer; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%globaltimer_lo; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_lo));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 310 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%globaltimer_hi; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_globaltimer_hi));)); -#endif // __cccl_ptx_isa >= 310 - -#if __cccl_ptx_isa >= 410 - NV_IF_TARGET( - NV_PROVIDES_SM_35, 
- ( - // mov.u32 sreg_value, %%total_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_total_smem_size));)); -#endif // __cccl_ptx_isa >= 410 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mov.u32 sreg_value, %%aggr_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_aggr_smem_size));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 410 - NV_IF_TARGET( - NV_PROVIDES_SM_35, - ( - // mov.u32 sreg_value, %%dynamic_smem_size; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_dynamic_smem_size));)); -#endif // __cccl_ptx_isa >= 410 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_50, - ( - // mov.u64 sreg_value, %%current_graph_exec; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::get_sreg_current_graph_exec));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/get_sreg.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index 73112e871b0..80fc71c0998 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_getctarank(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // getctarank.shared::cluster.u32 dest, addr; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::getctarank));)); -#endif // __cccl_ptx_isa >= 780 -} +#include "generated/getctarank.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 3a213d9bce3..2350b176630 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -31,123 +31,9 @@ * */ -__global__ void test_mbarrier_arrive(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET( - NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.shared.b64 state, [addr]; // 1. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive)); - // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr]; // 3a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cta.shared::cta.b64 state, [addr], count; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive)); - // mbarrier.arrive.release.cluster.shared::cta.b64 state, [addr], count; // 3b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr]; // 4a. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.release.cluster.shared::cluster.b64 _, [addr], count; // 4b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); -#endif // __cccl_ptx_isa >= 700 -} - -__global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 state, [addr], tx_count; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive_expect_tx)); - // mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 state, [addr], tx_count; // 8. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [addr], tx_count; // 9. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_arrive_expect_tx));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/mbarrier_arrive.inc" +#include "generated/mbarrier_arrive_expect_tx.inc" +#include "generated/mbarrier_arrive_no_complete.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index 6aa0f87e41e..b445a61a8a9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -31,16 +31,7 @@ * */ -__global__ void test_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.init.shared.b64 [addr], count; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_init));)); -#endif // __cccl_ptx_isa >= 700 -} +#include "generated/mbarrier_init.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index 007ccdef29c..e9c17a2024d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -31,162 +31,10 @@ * */ -__global__ void test_mbarrier_test_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait)); - // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 710 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 710 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity)); - // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_try_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // - // 5b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // - // 6a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait)); - // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 6a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // - // 6b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait)); - // mbarrier.try_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; - // // 6b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait));)); -#endif // __cccl_ptx_isa >= 800 -} - -__global__ void test_mbarrier_try_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 780 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // - // 8a. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity)); - // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
- * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // - // 8b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity)); - // mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, - // suspendTimeHint; // 8b. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_try_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} +#include "generated/mbarrier_try_wait.inc" +#include "generated/mbarrier_try_wait_parity.inc" +#include "generated/mbarrier_wait.inc" +#include "generated/mbarrier_wait_parity.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index 5a910b77fbd..4a380ec8396 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -31,126 +31,7 @@ * */ -__global__ void test_red_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - 
static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; - // // .u64 intentional - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::red_async));)); -#endif // __cccl_ptx_isa >= 810 -} +#include "generated/red_async.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 1cc0c1e2d74..2c74f48e04d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -31,41 +31,7 @@ * */ -__global__ void test_st_async(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [addr], value, [remote_bar]; // 1. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::st_async)); - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [addr], value, [remote_bar]; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [addr], value, [remote_bar]; // 2. - * fn_ptr++ = - reinterpret_cast(static_cast(cuda::ptx::st_async)); - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [addr], value, [remote_bar]; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 - -#if __cccl_ptx_isa >= 810 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; - // // 3. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::st_async));)); -#endif // __cccl_ptx_isa >= 810 -} +#include "generated/st_async.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index 9d923951f0c..d0d3a967836 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -31,35 +31,7 @@ * */ -__global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], - // size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy)); - // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size; - * fn_ptr++ = reinterpret_cast( - static_cast)>( - cuda::ptx::tensormap_cp_fenceproxy));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/tensormap_cp_fenceproxy.inc" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index f7360eacbcd..d780ff26dca 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -31,204 +31,7 @@ * */ -__global__ void test_tensormap_replace(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::tensormap_replace_global_address));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::tensormap_replace_global_address));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::tensormap_replace_rank));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::tensormap_replace_rank));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_box_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - 
( - // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_box_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_global_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_global_dim));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int64_t)>( - cuda::ptx::tensormap_replace_global_stride));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int64_t)>( - cuda::ptx::tensormap_replace_global_stride));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_element_size));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; - * fn_ptr++ = - reinterpret_cast(static_cast, int32_t)>( - cuda::ptx::tensormap_replace_element_size));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_elemtype));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_elemtype));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_interleave_layout));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_interleave_layout));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_swizzle_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // 
tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_swizzle_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_fill_mode));)); -#endif // __cccl_ptx_isa >= 830 - -#if __cccl_ptx_isa >= 830 - NV_IF_TARGET( - NV_HAS_FEATURE_SM_90a, - ( - // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; - * fn_ptr++ = - reinterpret_cast(static_cast)>( - cuda::ptx::tensormap_replace_fill_mode));)); -#endif // __cccl_ptx_isa >= 830 -} +#include "generated/tensormap_replace.inc" int main(int, char**) { From ee46f3e8f0f091b449923354ee2a189312b5031a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 22 Nov 2024 12:44:44 +0100 Subject: [PATCH 08/45] Reorganize PTX docs to match generator (#2929) Co-authored-by: Allard Hendriksen --- docs/libcudacxx/ptx/instructions.rst | 32 +- .../ptx/instructions/barrier_cluster.rst | 16 + .../ptx/instructions/cp_async_bulk.rst | 30 ++ ...oup.rst => cp_async_bulk_commit_group.rst} | 8 +- .../ptx/instructions/cp_async_bulk_tensor.rst | 23 ++ .../instructions/cp_async_bulk_wait_group.rst | 9 + .../ptx/instructions/cp_reduce_async_bulk.rst | 61 +++ .../cp_reduce_async_bulk_tensor.rst | 9 + docs/libcudacxx/ptx/instructions/fence.rst | 257 +----------- .../barrier_cluster.rst} | 15 - .../cp_async_bulk.rst} | 43 -- .../generated/cp_async_bulk_commit_group.rst | 7 + .../generated/cp_async_bulk_multicast.rst | 16 + .../cp_async_bulk_tensor.rst} | 105 ----- .../cp_async_bulk_tensor_multicast.rst | 84 ++++ .../cp_async_bulk_wait_group.rst} | 8 - .../cp_reduce_async_bulk.rst} | 164 -------- .../generated/cp_reduce_async_bulk_bf16.rst | 53 +++ .../generated/cp_reduce_async_bulk_f16.rst | 53 +++ .../cp_reduce_async_bulk_tensor.rst} | 8 - .../ptx/instructions/generated/fence.rst | 95 +++++ .../generated/fence_mbarrier_init.rst | 11 + .../generated/fence_proxy_alias.rst | 7 + .../generated/fence_proxy_async.rst | 37 ++ .../fence_proxy_tensormap_generic.rst | 103 +++++ .../ptx/instructions/generated/getctarank.rst | 10 + .../generated/mbarrier_arrive.rst | 111 +++++ .../generated/mbarrier_arrive_expect_tx.rst | 47 +++ .../generated/mbarrier_arrive_no_complete.rst | 9 + .../mbarrier_expect_tx.rst} | 8 - .../mbarrier_init.rst} | 8 - .../generated/mbarrier_test_wait.rst | 37 ++ .../generated/mbarrier_test_wait_parity.rst | 37 ++ .../generated/mbarrier_try_wait.rst | 78 ++++ .../generated/mbarrier_try_wait_parity.rst | 78 ++++ .../red_async.rst} | 30 -- .../generated/special_registers.rst | 383 +++++++++++++++++ .../{st.async.rst => generated/st_async.rst} | 13 - .../tensormap_cp_fenceproxy.rst} | 8 - .../tensormap_replace.rst} | 8 - .../ptx/instructions/getctarank.rst | 11 +- .../ptx/instructions/mbarrier.arrive.rst | 232 ----------- .../ptx/instructions/mbarrier.test_wait.rst | 91 ----- .../ptx/instructions/mbarrier.try_wait.rst | 174 -------- .../ptx/instructions/mbarrier_arrive.rst | 68 ++++ .../ptx/instructions/mbarrier_expect_tx.rst | 9 + .../ptx/instructions/mbarrier_init.rst | 9 + .../ptx/instructions/mbarrier_test_wait.rst | 19 + .../ptx/instructions/mbarrier_try_wait.rst | 20 + .../libcudacxx/ptx/instructions/red_async.rst | 31 ++ .../ptx/instructions/special_registers.rst | 384 +----------------- 
docs/libcudacxx/ptx/instructions/st_async.rst | 14 + .../instructions/tensormap_cp_fenceproxy.rst | 9 + .../ptx/instructions/tensormap_replace.rst | 9 + 54 files changed, 1616 insertions(+), 1583 deletions(-) create mode 100644 docs/libcudacxx/ptx/instructions/barrier_cluster.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_bulk.rst rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.commit_group.rst => cp_async_bulk_commit_group.rst} (58%) create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst create mode 100644 docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst rename docs/libcudacxx/ptx/instructions/{barrier.cluster.rst => generated/barrier_cluster.rst} (70%) rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.rst => generated/cp_async_bulk.rst} (57%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.tensor.rst => generated/cp_async_bulk_tensor.rst} (59%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst rename docs/libcudacxx/ptx/instructions/{cp.async.bulk.wait_group.rst => generated/cp_async_bulk_wait_group.rst} (62%) rename docs/libcudacxx/ptx/instructions/{cp.reduce.async.bulk.rst => generated/cp_reduce_async_bulk.rst} (80%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst rename docs/libcudacxx/ptx/instructions/{cp.reduce.async.bulk.tensor.rst => generated/cp_reduce_async_bulk_tensor.rst} (98%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/getctarank.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst rename docs/libcudacxx/ptx/instructions/{mbarrier.expect_tx.rst => generated/mbarrier_expect_tx.rst} (88%) rename docs/libcudacxx/ptx/instructions/{mbarrier.init.rst => generated/mbarrier_init.rst} (50%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst rename docs/libcudacxx/ptx/instructions/{red.async.rst => generated/red_async.rst} (89%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/special_registers.rst rename docs/libcudacxx/ptx/instructions/{st.async.rst => generated/st_async.rst} (83%) rename 
docs/libcudacxx/ptx/instructions/{tensormap.cp_fenceproxy.rst => generated/tensormap_cp_fenceproxy.rst} (89%) rename docs/libcudacxx/ptx/instructions/{tensormap.replace.rst => generated/tensormap_replace.rst} (97%) delete mode 100644 docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst delete mode 100644 docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst delete mode 100644 docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_init.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst create mode 100644 docs/libcudacxx/ptx/instructions/red_async.rst create mode 100644 docs/libcudacxx/ptx/instructions/st_async.rst create mode 100644 docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst create mode 100644 docs/libcudacxx/ptx/instructions/tensormap_replace.rst diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst index a518dad0ff2..f0776974eec 100644 --- a/docs/libcudacxx/ptx/instructions.rst +++ b/docs/libcudacxx/ptx/instructions.rst @@ -6,25 +6,25 @@ PTX Instructions .. toctree:: :maxdepth: 1 - instructions/barrier.cluster - instructions/cp.async.bulk - instructions/cp.async.bulk.commit_group - instructions/cp.async.bulk.wait_group - instructions/cp.async.bulk.tensor - instructions/cp.reduce.async.bulk - instructions/cp.reduce.async.bulk.tensor + instructions/barrier_cluster + instructions/cp_async_bulk + instructions/cp_async_bulk_commit_group + instructions/cp_async_bulk_wait_group + instructions/cp_async_bulk_tensor + instructions/cp_reduce_async_bulk + instructions/cp_reduce_async_bulk_tensor instructions/fence instructions/getctarank instructions/mapa - instructions/mbarrier.init - instructions/mbarrier.arrive - instructions/mbarrier.expect_tx - instructions/mbarrier.test_wait - instructions/mbarrier.try_wait - instructions/red.async - instructions/st.async - instructions/tensormap.replace - instructions/tensormap.cp_fenceproxy + instructions/mbarrier_init + instructions/mbarrier_arrive + instructions/mbarrier_expect_tx + instructions/mbarrier_test_wait + instructions/mbarrier_try_wait + instructions/red_async + instructions/st_async + instructions/tensormap_replace + instructions/tensormap_cp_fenceproxy instructions/special_registers diff --git a/docs/libcudacxx/ptx/instructions/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst new file mode 100644 index 00000000000..bc8943bc619 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/barrier_cluster.rst @@ -0,0 +1,16 @@ +.. _libcudacxx-ptx-instructions-barrier-cluster: + +barrier.cluster +=============== + +- PTX ISA: + `barrier.cluster `__ + +Similar functionality is provided through the builtins +``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, +as well as the ``cooperative_groups::cluster_group`` +`API `__. + +The ``.aligned`` variants of the instructions are not exposed. + +.. include:: generated/barrier_cluster.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst new file mode 100644 index 00000000000..32121ef8a12 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk.rst @@ -0,0 +1,30 @@ +.. 
_libcudacxx-ptx-instructions-cp-async-bulk: + +cp.async.bulk +============= + +- PTX ISA: + `cp.async.bulk `__ + +Implementation notes +-------------------- + +**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and +``size`` must be a multiple of 16. + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_multicast`` was enabled for + SM_90. This has been changed to SM_90a. + + +Unicast +------- + +.. include:: generated/cp_async_bulk.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst similarity index 58% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst rename to docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst index cc549f54163..8efc5ac0488 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_commit_group.rst @@ -6,10 +6,4 @@ cp.async.bulk.commit_group - PTX ISA: `cp.async.bulk.commit_group `__ -cp.async.bulk.commit_group -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template - __device__ static inline void cp_async_bulk_commit_group(); +.. include:: generated/cp_async_bulk_commit_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst new file mode 100644 index 00000000000..bde3488bac9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst @@ -0,0 +1,23 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: + +cp.async.bulk.tensor +==================== + +- PTX ISA: + `cp.async.bulk.tensor `__ + +Changelog +--------- + +- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled + for SM_90. This has been changed to SM_90a. + +Unicast +------- + +.. include:: generated/cp_async_bulk_tensor.rst + +Multicast +--------- + +.. include:: generated/cp_async_bulk_tensor_multicast.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst new file mode 100644 index 00000000000..e24bb0fc9fd --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_async_bulk_wait_group.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: + +cp.async.bulk.wait_group +======================== + +- PTX ISA: + `cp.async.bulk.wait_group `__ + +.. include:: generated/cp_async_bulk_wait_group.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst new file mode 100644 index 00000000000..a4710b5ce30 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk.rst @@ -0,0 +1,61 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: + +cp.reduce.async.bulk +==================== + +- PTX ISA: + `cp.reduce.async.bulk `__ + + +Integer and floating point instructions +--------------------------------------- + +.. include:: generated/cp_reduce_async_bulk.rst + +Emulation of ``.s64`` instruction +--------------------------------- + +PTX does not currently (CTK 12.3) expose +``cp.reduce.async.bulk.add.s64``. This exposure is emulated in +``cuda::ptx`` using: + +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 + // .dst = { .shared::cluster } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); + + // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .s64 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); + +FP16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_f16.rst + +BF16 instructions +----------------- + +.. include:: generated/cp_reduce_async_bulk_bf16.rst diff --git a/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst new file mode 100644 index 00000000000..598d9e1e3ea --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/cp_reduce_async_bulk_tensor.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: + +cp.reduce.async.bulk.tensor +=========================== + +- PTX ISA: + `cp.reduce.async.bulk.tensor `__ + +.. include:: generated/cp_reduce_async_bulk_tensor.rst diff --git a/docs/libcudacxx/ptx/instructions/fence.rst b/docs/libcudacxx/ptx/instructions/fence.rst index 8a4e7f281cb..82de170f63b 100644 --- a/docs/libcudacxx/ptx/instructions/fence.rst +++ b/docs/libcudacxx/ptx/instructions/fence.rst @@ -11,272 +11,25 @@ fence fence ----- -fence.sc.cta -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.gpu -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.sys -^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.cta -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.gpu -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.acq_rel.sys -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } - // .scope = { .cta, .gpu, .sys } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); - -fence.sc.cluster -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 2. 
PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); - -fence.acq_rel.cluster -^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template - __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence.rst fence.mbarrier_init ------------------- -fence.mbarrier_init.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - template - __device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); +.. include:: generated/fence_mbarrier_init.rst fence.proxy.alias ----------------- -fence.proxy.alias -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template - __device__ static inline void fence_proxy_alias(); +.. include:: generated/fence_proxy_alias.rst fence.proxy.async ----------------- -fence.proxy.async -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template - __device__ static inline void fence_proxy_async(); -fence.proxy.async.global -^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); - -fence.proxy.async.shared::cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 - // .space = { .global, .shared::cluster, .shared::cta } - template - __device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); +.. include:: generated/fence_proxy_async.rst fence.proxy.tensormap --------------------- -fence.proxy.tensormap::generic.release.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. 
PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.release.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); - -fence.proxy.tensormap::generic.acquire.cta -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.gpu -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); - -fence.proxy.tensormap::generic.acquire.sys -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster, .gpu, .sys } - template - __device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); +.. include:: generated/fence_proxy_tensormap_generic.rst diff --git a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst similarity index 70% rename from docs/libcudacxx/ptx/instructions/barrier.cluster.rst rename to docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index 99048587eb5..bd994990c05 100644 --- a/docs/libcudacxx/ptx/instructions/barrier.cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,18 +1,3 @@ -.. _libcudacxx-ptx-instructions-barrier-cluster: - -barrier.cluster -=============== - -- PTX ISA: - `barrier.cluster `__ - -Similar functionality is provided through the builtins -``__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(), __cluster_barrier_wait()``, -as well as the ``cooperative_groups::cluster_group`` -`API `__. - -The ``.aligned`` variants of the instructions are not exposed. - barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst similarity index 57% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 434a44a15a4..f5c236f8bf9 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,26 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk: - -cp.async.bulk -============= - -- PTX ISA: - `cp.async.bulk `__ - -Implementation notes --------------------- - -**NOTE.** Both ``srcMem`` and ``dstMem`` must be 16-byte aligned, and -``size`` must be a multiple of 16. - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_multicast`` was enabled for - SM_90. This has been changed to SM_90a. - -Unicast -------- - cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -67,23 +44,3 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); - -Multicast ---------- - -cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst new file mode 100644 index 00000000000..984b4aff976 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -0,0 +1,7 @@ +cp.async.bulk.commit_group +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 + template + __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst new file mode 100644 index 00000000000..9cb15d06fa3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -0,0 +1,16 @@ +cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
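For the unicast shared::cluster destination variant kept at the top of the file above, a minimal usage sketch could look like the following; the buffer names are placeholders and, per the note above, both pointers are assumed to be 16-byte aligned with ``bytes`` a multiple of 16.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch: start an asynchronous bulk copy from global memory into cluster shared
   // memory, with completion tracked by the shared-memory mbarrier `smem_bar`.
   __device__ void bulk_load(void* smem_dst, const void* gmem_src,
                             uint32_t bytes, uint64_t* smem_bar)
   {
     cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global,
                              smem_dst, gmem_src, bytes, smem_bar);
   }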
PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst similarity index 59% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index f095abcd1a3..40eb070e66a 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,20 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-tensor: - -cp.async.bulk.tensor -==================== - -- PTX ISA: - `cp.async.bulk.tensor `__ - -Changelog ---------- - -- In earlier versions, ``cp_async_bulk_tensor_multicast`` was enabled - for SM_90. This has been changed to SM_90a. - -Unicast -------- - cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -169,91 +152,3 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group const void* tensorMap, const int32_t (&tensorCoords)[5], const void* srcMem); - -Multicast ---------- - -cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); - -cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a - // .dst = { .shared::cluster } - // .src = { .global } - template - __device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst new file mode 100644 index 00000000000..2481c80bf3c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -0,0 +1,84 @@ +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. 
PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst similarity index 62% rename from docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 8939292d340..08ebd3c28a7 100644 --- a/docs/libcudacxx/ptx/instructions/cp.async.bulk.wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-async-bulk-wait_group: - -cp.async.bulk.wait_group -======================== - -- PTX ISA: - `cp.async.bulk.wait_group `__ - cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
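All of the multicast tensor copies above follow the same calling pattern; a sketch of the 2-D variant is shown below, assuming ``tmap`` is a valid tensor map in global memory, ``smem_dst`` and ``smem_bar`` live in shared memory, and ``cta_mask`` selects the receiving CTAs of the cluster (SM_90a).

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void multicast_tile(void* smem_dst, const void* tmap, int x, int y,
                                  uint64_t* smem_bar, uint16_t cta_mask)
   {
     // Broadcast one 2-D tile to every CTA selected by cta_mask.
     const int32_t coords[2] = {x, y};
     cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                     smem_dst, tmap, coords, smem_bar, cta_mask);
   }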
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst similarity index 80% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index 571e1d9842f..cc82d633375 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,15 +1,3 @@ -.. _libcudacxx-ptx-instructions-cp-reduce-async-bulk: - -cp.reduce.async.bulk -==================== - -- PTX ISA: - `cp.reduce.async.bulk `__ - - -Integer and floating point instructions ---------------------------------------- - cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -652,155 +640,3 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 int64_t* dstMem, const int64_t* srcMem, uint32_t size); - -Emulation of ``.s64`` instruction ---------------------------------- - -PTX does not currently (CTK 12.3) expose -``cp.reduce.async.bulk.add.s64``. This exposure is emulated in -``cuda::ptx`` using: - -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 - // .dst = { .shared::cluster } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); - - // cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .s64 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); - -FP16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .f16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); - -BF16 instructions ------------------ - -cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .min } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .max } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); - -cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 - // .dst = { .global } - // .src = { .shared::cta } - // .type = { .bf16 } - // .op = { .add } - template - __device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst new file mode 100644 index 00000000000..e4dea98a119 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .bf16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst new file mode 100644 index 00000000000..18c5e0bfc60 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -0,0 +1,53 @@ +cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .min } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .max } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); + +cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 + // .dst = { .global } + // .src = { .shared::cta } + // .type = { .f16 } + // .op = { .add } + template + __device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); diff --git a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst similarity index 98% rename from docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst rename to docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index 7ea7b5675aa..c653b01cd60 100644 --- a/docs/libcudacxx/ptx/instructions/cp.reduce.async.bulk.tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,11 +1,3 @@ -.. 
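As a rough usage sketch for the ``f16`` reduction listed above, the snippet below accumulates a CTA-shared buffer into global memory and then waits for the bulk group to finish reading the source; the buffer names and the commit/wait pairing are illustrative assumptions.

.. code:: cuda

   #include <cuda/ptx>
   #include <cuda_fp16.h>
   #include <cstdint>

   __device__ void reduce_add_f16(__half* gmem_dst, const __half* smem_src, uint32_t count)
   {
     // cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16
     cuda::ptx::cp_reduce_async_bulk(cuda::ptx::space_global, cuda::ptx::space_shared,
                                     cuda::ptx::op_add, gmem_dst, smem_src,
                                     static_cast<uint32_t>(count * sizeof(__half)));
     // Commit the bulk group and wait until the source buffer has been read.
     cuda::ptx::cp_async_bulk_commit_group();
     cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
   }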
_libcudacxx-ptx-instructions-cp-reduce-async-bulk-tensor: - -cp.reduce.async.bulk.tensor -=========================== - -- PTX ISA: - `cp.reduce.async.bulk.tensor `__ - cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst new file mode 100644 index 00000000000..2fe14dcb3b2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -0,0 +1,95 @@ +fence.sc.cta +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.gpu +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.sys +^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.acq_rel.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc, .acq_rel } + // .scope = { .cta, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); + +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); + +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc, .acq_rel } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst new file mode 100644 index 00000000000..0f5298e3359 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -0,0 +1,11 @@ +fence.mbarrier_init.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.mbarrier_init.sem.scope; // 3. 
PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + template + __device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst new file mode 100644 index 00000000000..935aab9b6df --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -0,0 +1,7 @@ +fence.proxy.alias +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.alias; // 4. PTX ISA 75, SM_70 + template + __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst new file mode 100644 index 00000000000..3e741a1f6c4 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -0,0 +1,37 @@ +fence.proxy.async +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async; // 5. PTX ISA 80, SM_90 + template + __device__ static inline void fence_proxy_async(); + +fence.proxy.async.global +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); + +fence.proxy.async.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // .space = { .global, .shared::cluster, .shared::cta } + template + __device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst new file mode 100644 index 00000000000..db582971c3d --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -0,0 +1,103 @@ +fence.proxy.tensormap::generic.release.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. 
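The release/acquire pair above is typically wrapped around in-place modification of a tensor map. A hedged sketch, assuming ``tmap`` is a 128-byte tensor map in global memory that one thread has just rewritten through the generic proxy:

.. code:: cuda

   #include <cuda/ptx>

   __device__ void publish_tensormap()
   {
     // Writer side: make the generic-proxy writes visible to the tensormap proxy, GPU-wide.
     cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
   }

   __device__ void acquire_tensormap(const void* tmap)
   {
     // Reader side: acquire the 128-byte tensor map before using it in bulk tensor copies.
     cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_acquire, cuda::ptx::scope_gpu,
                                              tmap, cuda::ptx::n32_t<128>{});
   }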
PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.release.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.proxy.tensormap::generic.acquire.cta +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.gpu +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); + +fence.proxy.tensormap::generic.acquire.sys +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst new file mode 100644 index 00000000000..c85f52ee302 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -0,0 +1,10 @@ +getctarank.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst new file mode 100644 index 00000000000..92cd106cad9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -0,0 +1,111 @@ +mbarrier.arrive.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); + +mbarrier.arrive.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 + template + __device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); + +mbarrier.arrive.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); + +mbarrier.arrive.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst new file mode 100644 index 00000000000..0087ae2f458 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -0,0 +1,47 @@ +mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster } + // .space = { .shared::cta } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // .sem = { .release } + // .scope = { .cluster } + // .space = { .shared::cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst new file mode 100644 index 00000000000..b6d7edbbeee --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -0,0 +1,9 @@ +mbarrier.arrive.noComplete.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 + template + __device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst similarity index 88% rename from docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index 9b40db58d0c..b87d6f62a23 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: - -mbarrier.expect_tx -================== - -- PTX ISA: - `mbarrier.expect_tx `__ - mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst similarity index 50% rename from docs/libcudacxx/ptx/instructions/mbarrier.init.rst rename to docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 8c7e65eeab6..3e529d86d78 100644 --- a/docs/libcudacxx/ptx/instructions/mbarrier.init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-init: - -mbarrier.init -============= - -- PTX ISA: - `mbarrier.arrive `__ - mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
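A typical producer-side use of the transaction-count arrival above looks roughly like the following; ``bar`` is assumed to be an mbarrier in CTA shared memory that has already been initialized.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch: arrive on the barrier and register `bytes` of still-outstanding
   // asynchronous data for the current phase (SM_90).
   __device__ uint64_t arrive_with_tx(uint64_t* bar, uint32_t bytes)
   {
     return cuda::ptx::mbarrier_arrive_expect_tx(cuda::ptx::sem_release, cuda::ptx::scope_cta,
                                                 cuda::ptx::space_shared, bar, bytes);
   }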
code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst new file mode 100644 index 00000000000..4cb241c7ca8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 + template + __device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst new file mode 100644 index 00000000000..e750c4a543f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -0,0 +1,37 @@ +mbarrier.test_wait.parity.shared.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 + template + __device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst new file mode 100644 index 00000000000..ce648c66ee9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst new file mode 100644 index 00000000000..3210dc0eab1 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -0,0 +1,78 @@ +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 + template + __device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster } + template + __device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/red.async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/red.async.rst rename to docs/libcudacxx/ptx/instructions/generated/red_async.rst index 62599548a22..d6b9cf36549 100644 --- a/docs/libcudacxx/ptx/instructions/red.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-mbarrier-red-async: - -red.async -========= - -- PTX ISA: - `red.async `__ - -.. _red.async-1: - -red.async ---------- - red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -191,20 +178,3 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 int64_t* dest, const int64_t& value, int64_t* remote_bar); - -red.async ``.s64`` emulation ----------------------------- - -PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This -exposure is emulated in ``cuda::ptx`` using - -.. 
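On the consumer side, the parity-based waits above are usually wrapped in a spin loop. A minimal sketch, assuming ``bar`` lives in CTA shared memory and ``phase`` is the parity bit of the phase being waited on:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   __device__ void wait_phase(uint64_t* bar, uint32_t phase)
   {
     // Poll until the mbarrier completes the phase with the given parity (SM_90).
     while (!cuda::ptx::mbarrier_try_wait_parity(cuda::ptx::sem_acquire, cuda::ptx::scope_cta,
                                                 bar, phase))
     {
     }
   }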
code:: cuda - - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 - // .op = { .add } - template - __device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst new file mode 100644 index 00000000000..aa1add84781 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/special_registers.rst @@ -0,0 +1,383 @@ +tid.x +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_x(); + +tid.y +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_y(); + +tid.z +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_tid_z(); + +ntid.x +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_x(); + +ntid.y +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_y(); + +ntid.z +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ntid_z(); + +laneid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%laneid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_laneid(); + +warpid +^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%warpid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_warpid(); + +nwarpid +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nwarpid(); + +ctaid.x +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_x(); + +ctaid.y +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_y(); + +ctaid.z +^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_ctaid_z(); + +nctaid.x +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_x(); + +nctaid.y +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_y(); + +nctaid.z +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 + template + __device__ static inline uint32_t get_sreg_nctaid_z(); + +smid +^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%smid; // PTX ISA 13 + template + __device__ static inline uint32_t get_sreg_smid(); + +nsmid +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_nsmid(); + +gridid +^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%gridid; // PTX ISA 30 + template + __device__ static inline uint64_t get_sreg_gridid(); + +is_explicit_cluster +^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 + template + __device__ static inline bool get_sreg_is_explicit_cluster(); + +clusterid.x +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_x(); + +clusterid.y +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_y(); + +clusterid.z +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_clusterid_z(); + +nclusterid.x +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_x(); + +nclusterid.y +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_y(); + +nclusterid.z +^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_nclusterid_z(); + +cluster_ctaid.x +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); + +cluster_ctaid.y +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); + +cluster_ctaid.z +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); + +cluster_nctaid.x +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); + +cluster_nctaid.y +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); + +cluster_nctaid.z +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); + +cluster_ctarank +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_ctarank(); + +cluster_nctarank +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 + template + __device__ static inline uint32_t get_sreg_cluster_nctarank(); + +lanemask_eq +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_eq(); + +lanemask_le +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_le(); + +lanemask_lt +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_lt(); + +lanemask_ge +^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_ge(); + +lanemask_gt +^^^^^^^^^^^ +.. 
code:: cuda + + // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 + template + __device__ static inline uint32_t get_sreg_lanemask_gt(); + +clock +^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock; // PTX ISA 10 + template + __device__ static inline uint32_t get_sreg_clock(); + +clock_hi +^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 + template + __device__ static inline uint32_t get_sreg_clock_hi(); + +clock64 +^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 + template + __device__ static inline uint64_t get_sreg_clock64(); + +globaltimer +^^^^^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 + template + __device__ static inline uint64_t get_sreg_globaltimer(); + +globaltimer_lo +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_lo(); + +globaltimer_hi +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 + template + __device__ static inline uint32_t get_sreg_globaltimer_hi(); + +total_smem_size +^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_total_smem_size(); + +aggr_smem_size +^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 + template + __device__ static inline uint32_t get_sreg_aggr_smem_size(); + +dynamic_smem_size +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 + template + __device__ static inline uint32_t get_sreg_dynamic_smem_size(); + +current_graph_exec +^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 + template + __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/st.async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/st.async.rst rename to docs/libcudacxx/ptx/instructions/generated/st_async.rst index a2e1ebe46a6..c519ea57f70 100644 --- a/docs/libcudacxx/ptx/instructions/st.async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,16 +1,3 @@ -.. _libcudacxx-ptx-instructions-st-async: - -st.async -======== - -- PTX ISA: - `st.async `__ -- Used in: :ref:`How to use st.async ` - -**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For -instance, the ``addr`` supplied to the ``v2.b32`` variant must be -aligned to ``2 x 4 = 8`` bytes. - st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst similarity index 89% rename from docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 1de158491a8..52fae102ad4 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,11 +1,3 @@ -.. 
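The special-register wrappers above are plain value reads and can be combined freely; for example, a small illustrative logging helper:

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdio>

   __device__ void log_position()
   {
     unsigned int lane     = cuda::ptx::get_sreg_laneid();
     unsigned int sm       = cuda::ptx::get_sreg_smid();
     unsigned long long t  = cuda::ptx::get_sreg_clock64();  // SM_35+
     printf("lane %u on SM %u at clock64 %llu\n", lane, sm, t);
   }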
_libcudacxx-ptx-instructions-tensormap-cp_fenceproxy: - -tensormap.cp_fenceproxy -======================= - -- PTX ISA: - `tensormap.cp_fenceproxy `__ - tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst similarity index 97% rename from docs/libcudacxx/ptx/instructions/tensormap.replace.rst rename to docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 7d8b839584e..33e6f1d839a 100644 --- a/docs/libcudacxx/ptx/instructions/tensormap.replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,11 +1,3 @@ -.. _libcudacxx-ptx-instructions-tensormap-replace: - -tensormap.replace -================= - -- PTX ISA: - `tensormap.replace `__ - tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/getctarank.rst b/docs/libcudacxx/ptx/instructions/getctarank.rst index 5bad6259103..d355ed80929 100644 --- a/docs/libcudacxx/ptx/instructions/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/getctarank.rst @@ -6,13 +6,4 @@ getctarank - PTX ISA: `getctarank `__ -getctarank.shared::cluster.u32 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 - // .space = { .shared::cluster } - template - __device__ static inline uint32_t getctarank( - cuda::ptx::space_cluster_t, - const void* addr); +.. include:: generated/getctarank.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst deleted file mode 100644 index c383c59c6fd..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.arrive.rst +++ /dev/null @@ -1,232 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-arrive: - -mbarrier.arrive -=============== - -- PTX ISA: - `mbarrier.arrive `__ - -.. _mbarrier.arrive-1: - -mbarrier.arrive ---------------- - -Some of the listed PTX instructions below are semantically equivalent. -They differ in one important way: the shorter instructions are typically -supported on older compilers. - -mbarrier.arrive.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr); - -mbarrier.arrive.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template - __device__ static inline uint64_t mbarrier_arrive( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. 
PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); - -mbarrier.arrive.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); - -mbarrier.arrive.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.no_complete ---------------------------- - -mbarrier.arrive.noComplete.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 - template - __device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); - -mbarrier.arrive.expect_tx -------------------------- - -mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cta, .cluster } - // .space = { .shared::cta } - template - __device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); - -mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 - // .sem = { .release } - // .scope = { .cluster } - // .space = { .shared::cluster } - template - __device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); - -Usage ------ - -.. code:: cuda - - #include - #include - #include - - __global__ void kernel() { - using cuda::ptx::sem_release; - using cuda::ptx::space_cluster; - using cuda::ptx::space_shared; - using cuda::ptx::scope_cluster; - using cuda::ptx::scope_cta; - - using barrier_t = cuda::barrier; - __shared__ barrier_t bar; - init(&bar, blockDim.x); - __syncthreads(); - - NV_IF_TARGET(NV_PROVIDES_SM_90, ( - // Arrive on local shared memory barrier: - uint64_t token; - token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); - - // Get address of remote cluster barrier: - namespace cg = cooperative_groups; - cg::cluster_group cluster = cg::this_cluster(); - unsigned int other_block_rank = cluster.block_rank() ^ 1; - uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); - - // Sync cluster to ensure remote barrier is initialized. - cluster.sync(); - - // Arrive on remote cluster barrier: - cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); - ) - } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst deleted file mode 100644 index 23197e2eb7c..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.test_wait.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-test_wait: - -mbarrier.test_wait -================== - -- PTX ISA: - `mbarrier.test_wait `__ - -.. _mbarrier.test_wait-1: - -mbarrier.test_wait ------------------- - -mbarrier.test_wait.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template - __device__ static inline bool mbarrier_test_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.test_wait.parity -------------------------- - -mbarrier.test_wait.parity.shared.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template - __device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst deleted file mode 100644 index 762f5e100d7..00000000000 --- a/docs/libcudacxx/ptx/instructions/mbarrier.try_wait.rst +++ /dev/null @@ -1,174 +0,0 @@ -.. _libcudacxx-ptx-instructions-mbarrier-try_wait: - -mbarrier.try_wait -================= - -- PTX ISA: - `mbarrier.try_wait `__ - - -.. _mbarrier.try_wait-1: - -mbarrier.try_wait ------------------ - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. 
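For orientation, the untimed ``try_wait`` overload listed above is usually driven from a small polling loop; a minimal sketch follows (illustrative only; it assumes the ``<cuda/ptx>`` header, an already initialized mbarrier in shared memory, and an SM_90 target):

.. code:: cuda

   #include <cuda/ptx>
   #include <cuda/std/cstdint>

   // Arrive on a shared-memory mbarrier, then spin until the phase completes.
   // mbarrier_arrive returns the state token that try_wait tests against.
   __device__ void arrive_and_spin(cuda::std::uint64_t* bar)
   {
     cuda::std::uint64_t state = cuda::ptx::mbarrier_arrive(bar);

     // try_wait may time out and return false, so poll until it succeeds.
     while (!cuda::ptx::mbarrier_try_wait(bar, state))
     {
     }
   }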
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); - -mbarrier.try_wait.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity ------------------------- - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 - template - __device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); - -mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); - -mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 - // .sem = { .acquire } - // .scope = { .cta, .cluster } - template - __device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst new file mode 100644 index 00000000000..f01e7a95465 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_arrive.rst @@ -0,0 +1,68 @@ +.. _libcudacxx-ptx-instructions-mbarrier-arrive: + +mbarrier.arrive +=============== + +- PTX ISA: + `mbarrier.arrive `__ + +.. _mbarrier.arrive-1: + +mbarrier.arrive +--------------- + +Some of the listed PTX instructions below are semantically equivalent. +They differ in one important way: the shorter instructions are typically +supported on older compilers. + +.. include:: generated/mbarrier_arrive.rst + +mbarrier.arrive.no_complete +--------------------------- + +.. include:: generated/mbarrier_arrive_no_complete.rst + +mbarrier.arrive.expect_tx +------------------------- + +.. include:: generated/mbarrier_arrive_expect_tx.rst + +Usage +----- + +.. code:: cuda + + #include + #include + #include + + __global__ void kernel() { + using cuda::ptx::sem_release; + using cuda::ptx::space_cluster; + using cuda::ptx::space_shared; + using cuda::ptx::scope_cluster; + using cuda::ptx::scope_cta; + + using barrier_t = cuda::barrier; + __shared__ barrier_t bar; + init(&bar, blockDim.x); + __syncthreads(); + + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + // Arrive on local shared memory barrier: + uint64_t token; + token = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); + + // Get address of remote cluster barrier: + namespace cg = cooperative_groups; + cg::cluster_group cluster = cg::this_cluster(); + unsigned int other_block_rank = cluster.block_rank() ^ 1; + uint64_t * remote_bar = cluster.map_shared_rank(&bar, other_block_rank); + + // Sync cluster to ensure remote barrier is initialized. + cluster.sync(); + + // Arrive on remote cluster barrier: + cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, remote_bar, 1); + ) + } diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst new file mode 100644 index 00000000000..6c34813242f --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_expect_tx.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-mbarrier-expect_tx: + +mbarrier.expect_tx +================== + +- PTX ISA: + `mbarrier.expect_tx `__ + +.. include:: generated/mbarrier_expect_tx.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst new file mode 100644 index 00000000000..a736f53b0a2 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_init.rst @@ -0,0 +1,9 @@ +.. 
_libcudacxx-ptx-instructions-mbarrier-init: + +mbarrier.init +============= + +- PTX ISA: + `mbarrier.arrive `__ + +.. include:: generated/mbarrier_init.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst new file mode 100644 index 00000000000..d8a4e79473e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_test_wait.rst @@ -0,0 +1,19 @@ +.. _libcudacxx-ptx-instructions-mbarrier-test_wait: + +mbarrier.test_wait +================== + +- PTX ISA: + `mbarrier.test_wait `__ + +.. _mbarrier.test_wait-1: + +mbarrier.test_wait +------------------ + +.. include:: generated/mbarrier_test_wait.rst + +mbarrier.test_wait.parity +------------------------- + +.. include:: generated/mbarrier_test_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst new file mode 100644 index 00000000000..1869695f3f6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/mbarrier_try_wait.rst @@ -0,0 +1,20 @@ +.. _libcudacxx-ptx-instructions-mbarrier-try_wait: + +mbarrier.try_wait +================= + +- PTX ISA: + `mbarrier.try_wait `__ + + +.. _mbarrier.try_wait-1: + +mbarrier.try_wait +----------------- + +.. include:: generated/mbarrier_try_wait.rst + +mbarrier.try_wait.parity +------------------------ + +.. include:: generated/mbarrier_try_wait_parity.rst diff --git a/docs/libcudacxx/ptx/instructions/red_async.rst b/docs/libcudacxx/ptx/instructions/red_async.rst new file mode 100644 index 00000000000..82ba07c38de --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/red_async.rst @@ -0,0 +1,31 @@ +.. _libcudacxx-ptx-instructions-mbarrier-red-async: + +red.async +========= + +- PTX ISA: + `red.async `__ + +.. _red.async-1: + +red.async +--------- + +.. include:: generated/red_async.rst + +red.async ``.s64`` emulation +---------------------------- + +PTX does not currently (CTK 12.3) expose ``red.async.add.s64``. This +exposure is emulated in ``cuda::ptx`` using + +.. code:: cuda + + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // .op = { .add } + template + __device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst index 375ce44622e..1e9597fa726 100644 --- a/docs/libcudacxx/ptx/instructions/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/special_registers.rst @@ -6,386 +6,4 @@ Special registers - PTX ISA: `Special Register `__ -tid.x -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_x(); - -tid.y -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_y(); - -tid.z -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_tid_z(); - -ntid.x -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_x(); - -ntid.y -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_y(); - -ntid.z -^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ntid_z(); - -laneid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_laneid(); - -warpid -^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_warpid(); - -nwarpid -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nwarpid(); - -ctaid.x -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_x(); - -ctaid.y -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_y(); - -ctaid.z -^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_ctaid_z(); - -nctaid.x -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_x(); - -nctaid.y -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_y(); - -nctaid.z -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template - __device__ static inline uint32_t get_sreg_nctaid_z(); - -smid -^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template - __device__ static inline uint32_t get_sreg_smid(); - -nsmid -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_nsmid(); - -gridid -^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template - __device__ static inline uint64_t get_sreg_gridid(); - -is_explicit_cluster -^^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template - __device__ static inline bool get_sreg_is_explicit_cluster(); - -clusterid.x -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_x(); - -clusterid.y -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_y(); - -clusterid.z -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_clusterid_z(); - -nclusterid.x -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_x(); - -nclusterid.y -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_y(); - -nclusterid.z -^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_nclusterid_z(); - -cluster_ctaid.x -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); - -cluster_ctaid.y -^^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); - -cluster_ctaid.z -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); - -cluster_nctaid.x -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); - -cluster_nctaid.y -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); - -cluster_nctaid.z -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); - -cluster_ctarank -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_ctarank(); - -cluster_nctarank -^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template - __device__ static inline uint32_t get_sreg_cluster_nctarank(); - -lanemask_eq -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_eq(); - -lanemask_le -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_le(); - -lanemask_lt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_lt(); - -lanemask_ge -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_ge(); - -lanemask_gt -^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template - __device__ static inline uint32_t get_sreg_lanemask_gt(); - -clock -^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template - __device__ static inline uint32_t get_sreg_clock(); - -clock_hi -^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template - __device__ static inline uint32_t get_sreg_clock_hi(); - -clock64 -^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template - __device__ static inline uint64_t get_sreg_clock64(); - -globaltimer -^^^^^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template - __device__ static inline uint64_t get_sreg_globaltimer(); - -globaltimer_lo -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_lo(); - -globaltimer_hi -^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template - __device__ static inline uint32_t get_sreg_globaltimer_hi(); - -total_smem_size -^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_total_smem_size(); - -aggr_smem_size -^^^^^^^^^^^^^^ -.. 
code:: cuda - - // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template - __device__ static inline uint32_t get_sreg_aggr_smem_size(); - -dynamic_smem_size -^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template - __device__ static inline uint32_t get_sreg_dynamic_smem_size(); - -current_graph_exec -^^^^^^^^^^^^^^^^^^ -.. code:: cuda - - // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template - __device__ static inline uint64_t get_sreg_current_graph_exec(); +.. include:: generated/special_registers.rst diff --git a/docs/libcudacxx/ptx/instructions/st_async.rst b/docs/libcudacxx/ptx/instructions/st_async.rst new file mode 100644 index 00000000000..c71aebd7da3 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/st_async.rst @@ -0,0 +1,14 @@ +.. _libcudacxx-ptx-instructions-st-async: + +st.async +======== + +- PTX ISA: + `st.async `__ +- Used in: :ref:`How to use st.async ` + +**NOTE.** Alignment of ``addr`` must be a multiple of vector size. For +instance, the ``addr`` supplied to the ``v2.b32`` variant must be +aligned to ``2 x 4 = 8`` bytes. + +.. include:: generated/st_async.rst diff --git a/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst new file mode 100644 index 00000000000..2f7622bba2c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tensormap_cp_fenceproxy.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tensormap-cp_fenceproxy: + +tensormap.cp_fenceproxy +======================= + +- PTX ISA: + `tensormap.cp_fenceproxy `__ + +.. include:: generated/tensormap_cp_fenceproxy.rst diff --git a/docs/libcudacxx/ptx/instructions/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst new file mode 100644 index 00000000000..331dcff313a --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/tensormap_replace.rst @@ -0,0 +1,9 @@ +.. _libcudacxx-ptx-instructions-tensormap-replace: + +tensormap.replace +================= + +- PTX ISA: + `tensormap.replace `__ + +.. include:: generated/tensormap_replace.rst From f6ec34b40d69bc42c254de4aab8bda4008857c73 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 22 Nov 2024 13:31:30 +0100 Subject: [PATCH 09/45] Improve build instructions for libcu++ (#2881) * Improve build instructions for libcu++ * Add section about the options for the build script * Delegate more to the contributor guidelines --- .../libcudacxx/setup/building_and_testing.rst | 200 ++---------------- 1 file changed, 21 insertions(+), 179 deletions(-) diff --git a/docs/libcudacxx/setup/building_and_testing.rst b/docs/libcudacxx/setup/building_and_testing.rst index 7a420d0c09a..5b3b010a294 100644 --- a/docs/libcudacxx/setup/building_and_testing.rst +++ b/docs/libcudacxx/setup/building_and_testing.rst @@ -3,200 +3,42 @@ Building & Testing libcu++ ========================== -\*nix Systems, Native Build/Test --------------------------------- +libcu++ can be build and tested as shown in our `contributor guidelines `_. -The procedure is demonstrated for NVCC + GCC in C++11 mode on a -Debian-like Linux systems; the same basic steps are required on all -other platforms. - -Step 0: Install Build Requirements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Bash shell: - -.. 
code:: bash - - # Install LLVM (needed for LLVM's CMake modules) - apt-get -y install llvm - - # Install CMake - apt-get -y install cmake - - # Install the LLVM Integrated Tester (`lit`) - apt-get -y install python-pip - pip install lit - - # Env vars that should be set, or kept in mind for use later - export LIBCUDACXX_ROOT=/path/to/libcudacxx # Git repo root. - -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Bash shell: +However, often only a small subset of the full test suite needs to be run during development. For that we rely on ``lit``. +After libcu++ has been configured either through the build scripts or directly via a cmake preset one can then run. .. code:: bash - cd ${LIBCUDACXX_ROOT} - cmake \ - -S ./ \ - -B build \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_CUDA_COMPILER=$TOOLKIT/bin/nvcc \ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + cd build + lit libcudacxx-cpp17/RELATIVE_PATH_TO_TEST_OR_SUBFOLDER -sv -In a Bash shell: +This will build and run all tests within ``RELATIVE_PATH_TO_TEST_OR_SUBFOLDER`` which must be a valid path within the CCCL. +Note that the name of the top level folder is the same as the name of the preset. For the build script the default is +``libcudacxx-cpp17``. As an example this is how to run all tests for ``cuda::std::span``, which are located in +``libcudacxx/test/libcudacxx/std/containers/views/views.span`` .. code:: bash - cd ${LIBCUDACXX_ROOT}/build # build directory of this repo - ../utils/nvidia/linux/perform_tests.bash --skip-libcxx-tests - -\*nix Systems, Cross Build/Test -------------------------------- - -The procedure is demonstrated for NVCC + GCC cross compiler in C++14 -mode on a Debian-like Linux systems targeting an aarch64 L4T system; the -same basic steps are required on all other platforms. - -Step 0: Install Build Prerequisites -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + cd build -Follow Step 0 for \*nix native builds/tests. + # Builds all tests within libcudacxx/test/libcudacxx/std/containers/views/views.span + lit libcudacxx-cpp17/libcudacxx/test/libcudacxx/std/containers/views/views.span -sv -.. _step-1-generate-the-build-files-1: + # Builds the individual test array.pass.cpp + lit libcudacxx-cpp17/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp -sv -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Bash shell: +If only building the tests and not running them is desired one can pass ``-Dexecutor="NoopExecutor()"`` to the lit invocation. +This is especially usefull if the machine has no GPU or testing a different architecture .. code:: bash - export HOST=executor.nvidia.com - export USERNAME=ubuntu - - cd ${LIBCUDACXX_ROOT} - cmake \ - -S ./ \ - -B build \ - -DCMAKE_CUDA_COMPILER=$TOOLKIT/bin/nvcc \ - -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON \ - -DLIBCXX_EXECUTOR="SSHExecutor(host='${HOST}', username='${USERNAME}')" - -Ensure that you can SSH to the target system from the host system -without inputing a password (e.g. use SSH keys). - -.. _step-2-build-run-the-tests-1: - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Follow Step 2 for \*nix native builds/tests. - -\*nix Systems, NVRTC Build/Test -------------------------------- - -The procedure is demonstrated for NVRTC in C++11 mode on a Debian-like -Linux systems; the same basic steps are required on all other platforms. - -.. 
_step-0-install-build-prerequisites-1: - -Step 0: Install Build Prerequisites -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Follow Step 0 for \*nix native builds/tests. - -.. _step-1-generate-the-build-files-2: - -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + cd build + lit libcudacxx-cpp17/RELATIVE_PATH_TO_TEST_OR_SUBFOLDER -sv -Dexecutor="NoopExecutor()" -In a Bash shell: +Finally different standard modes can be tested by passing e.g ``--param=std=c++20`` .. code:: bash - cd ${LIBCUDACXX_ROOT} - cmake \ - -S ./ \ - -B build \ - -DCMAKE_CXX_COMPILER=$CC \ - -DCMAKE_CUDA_COMPILER=$TOOLKIT/bin/nvcc \ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON \ - -DLIBCUDACXX_TEST_WITH_NVRTC=ON - -.. _step-2-build-run-the-tests-2: - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Follow Step 2 for \*nix native builds/tests. - -Windows, Native Build/Test --------------------------- - -.. _step-0-install-build-requirements-1: - -Step 0: Install Build Requirements -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -`Install Python `_. - -Download `the get-pip.py bootstrap -script `_ and run it. - -Install the LLVM Integrated Tester (``lit``) using a Visual Studio -command prompt: - -.. code:: bat - - pip install lit - -Step 0.5: Launching a Build Environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Visual Studio comes with a few build environments that are appropriate -to use. - -The ``x64 Native Tools Command Prompt`` and other similarly named -environments will work. - -If Powershell is desired, it would be best to launch it from within the -native tools. This helps avoid configuration step issues. - -.. _step-1-generate-the-build-files-3: - -Step 1: Generate the Build Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In a Visual Studio command prompt: - -.. code:: bat - - set LIBCUDACXX_ROOT=\path\to\libcudacxx # Helpful env var pointing to the git repo root. - cd %LIBCUDACXX_ROOT% - - cmake ^ - -S ./ ^ - -B build ^ - -G "Ninja" ^ - -DCMAKE_CXX_COMPILER=cl ^ - -DCMAKE_CUDA_COMPILER=nvcc ^ - -DCMAKE_CUDA_COMPILER_FORCED=ON ^ - -DLIBCUDACXX_ENABLE_LIBCUDACXX_TESTS=ON - -.. _step-2-build-run-the-tests-3: - -Step 2: Build & Run the Tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -``SM_ARCH`` can be set to any integer value (Ex: “80”, “86”) - -.. 
code:: bat - - set LIBCUDACXX_SITE_CONFIG=%LIBCUDACXX_ROOT%\build\test\lit.site.cfg - lit %LIBCUDACXX_ROOT%\test -Dcompute_archs=%SM_ARCH% -sv --no-progress-bar + cd build + lit libcudacxx-cpp17/RELATIVE_PATH_TO_TEST_OR_SUBFOLDER -sv --param=std=c++20 From b27d512d43d9b28505ca8f3f86623640bcea1f8b Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 22 Nov 2024 13:37:55 +0100 Subject: [PATCH 10/45] Reorganize PTX headers to match generator (#2925) --- .../cuda/__ptx/instructions/barrier_cluster.h | 124 +- .../cuda/__ptx/instructions/cp_async_bulk.h | 158 +- .../instructions/cp_async_bulk_commit_group.h | 22 +- .../__ptx/instructions/cp_async_bulk_tensor.h | 657 +------ .../instructions/cp_async_bulk_wait_group.h | 46 +- .../__ptx/instructions/cp_reduce_async_bulk.h | 1673 +---------------- .../cp_reduce_async_bulk_tensor.h | 533 +----- .../include/cuda/__ptx/instructions/fence.h | 252 +-- .../generated/barrier_cluster.inc | 123 ++ .../instructions/generated/cp_async_bulk.inc | 111 ++ .../generated/cp_async_bulk_commit_group.inc | 21 + .../generated/cp_async_bulk_multicast.inc | 45 + .../generated/cp_async_bulk_tensor.inc | 416 ++++ .../cp_async_bulk_tensor_multicast.inc | 239 +++ .../generated/cp_async_bulk_wait_group.inc | 45 + .../generated/cp_reduce_async_bulk.inc | 1435 ++++++++++++++ .../generated/cp_reduce_async_bulk_bf16.inc | 127 ++ .../generated/cp_reduce_async_bulk_f16.inc | 110 ++ .../generated/cp_reduce_async_bulk_tensor.inc | 532 ++++++ .../__ptx/instructions/generated/fence.inc | 67 + .../generated/fence_mbarrier_init.inc | 27 + .../generated/fence_proxy_alias.inc | 21 + .../generated/fence_proxy_async.inc | 50 + .../fence_proxy_tensormap_generic.inc | 82 + .../__ptx/instructions/generated/get_sreg.inc | 1001 ++++++++++ .../instructions/generated/getctarank.inc | 27 + .../generated/mbarrier_arrive.inc | 205 ++ .../generated/mbarrier_arrive_expect_tx.inc | 79 + .../generated/mbarrier_arrive_no_complete.inc | 26 + .../instructions/generated/mbarrier_init.inc | 23 + .../generated/mbarrier_test_wait.inc | 75 + .../generated/mbarrier_test_wait_parity.inc | 75 + .../generated/mbarrier_try_wait.inc | 157 ++ .../generated/mbarrier_try_wait_parity.inc | 157 ++ .../instructions/generated/red_async.inc | 417 ++++ .../__ptx/instructions/generated/st_async.inc | 108 ++ .../generated/tensormap_cp_fenceproxy.inc | 54 + .../generated/tensormap_replace.inc | 569 ++++++ .../cuda/__ptx/instructions/get_sreg.h | 1002 +--------- .../cuda/__ptx/instructions/getctarank.h | 28 +- .../cuda/__ptx/instructions/mbarrier_arrive.h | 313 +-- .../cuda/__ptx/instructions/mbarrier_init.h | 24 +- .../cuda/__ptx/instructions/mbarrier_wait.h | 468 +---- .../cuda/__ptx/instructions/red_async.h | 418 +--- .../cuda/__ptx/instructions/st_async.h | 109 +- .../instructions/tensormap_cp_fenceproxy.h | 55 +- .../__ptx/instructions/tensormap_replace.h | 570 +----- 47 files changed, 6454 insertions(+), 6422 deletions(-) create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc create mode 
100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index bc7d88efd48..8b09ddd1110 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,129 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. 
Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -/* -// barrier.cluster.arrive; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.wait; // PTX ISA 78, SM_90 -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_wait() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .release } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_release_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) -{ - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 -// .sem = { .relaxed } -// Marked volatile -template -__device__ static inline void barrier_cluster_arrive( - cuda::ptx::sem_relaxed_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) -{ - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// barrier.cluster.wait.sem; // PTX ISA 80, SM_90 -// .sem = { .acquire } -// Marked volatile and as clobbering memory -template -__device__ static inline void barrier_cluster_wait( - cuda::ptx::sem_acquire_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void 
barrier_cluster_wait(sem_acquire_t) -{ - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 7acce210230..480a02a701e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,162 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_shared_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. 
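The reorganization above moves the wrapper bodies into ``generated/*.inc`` files; as far as the diff shows, the public entry points keep the signatures of the removed code. As a usage sketch of the cluster-barrier pair exposed by ``barrier_cluster.h`` (illustrative only, not part of this patch; it assumes ``<cuda/ptx>``, the ``sem_release``/``sem_acquire`` tag constants, and an SM_90 cluster launch):

.. code:: cuda

   #include <cuda/ptx>

   // Split cluster synchronization: release our writes, overlap independent
   // work, then acquire before reading other CTAs' shared memory.
   __device__ void cluster_sync_split()
   {
     cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
     // ... work that does not touch remote shared memory ...
     cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
   }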
" - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - void* dstMem, - const void* srcMem, - const uint32_t& size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. " - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* srcMem, - const uint32_t& size, - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __srcMem, - const _CUDA_VSTD::uint32_t& __size, - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. " - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index f0028105350..bd97259cf19 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,27 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. 
Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -/* -// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_commit_group(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_commit_group() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index b66981e8bbb..5b9f575ce5f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,661 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. PTX ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_global_t, - space_shared_t, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. 
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. 
PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a -// .dst = { .shared::cluster } -// .src = { .global } -template -__device__ static inline void cp_async_bulk_tensor( - cuda::ptx::space_cluster_t, - cuda::ptx::space_global_t, - void* dstMem, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - uint64_t* smem_bar, - const uint16_t& ctaMask); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_tensor( - space_cluster_t, - space_global_t, - void* __dstMem, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - _CUDA_VSTD::uint64_t* __smem_bar, - const _CUDA_VSTD::uint16_t& __ctaMask) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 5dcbf8572f4..00a3700e1a9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,51 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -/* -// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 -template -__device__ static inline void cp_async_bulk_wait_group_read( - cuda::ptx::n32_t N); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" - : - : "n"(__n.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee89e33c1c2..ee6d90bc4d9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,1679 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_and_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_or_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - B32* dstMem, - const B32* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_xor_op_t, - _B32* __dstMem, - const _B32* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX -ISA 80, SM_90 -// .dst = { .shared::cluster } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_cluster_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size, - uint64_t* rdsmem_bar); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_cluster_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size, - _CUDA_VSTD::uint64_t* __rdsmem_bar) -{ - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .and } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_and_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .or } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_or_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .b32, .b64 } -// .op = { .xor } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_xor_op_t, - Type* dstMem, - const Type* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_inc_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_inc_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_dec_t, - uint32_t* dstMem, - const uint32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_dec_t, - _CUDA_VSTD::uint32_t* __dstMem, - const _CUDA_VSTD::uint32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int32_t* dstMem, - const int32_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int32_t* __dstMem, - const _CUDA_VSTD::int32_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
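// A minimal usage sketch of the bulk-reduce overloads above, assuming an SM_90 target and
// PTX ISA 8.0. The kernel, buffer names and sizes are illustrative assumptions; the commit
// and wait helpers come from the same cuda::ptx namespace.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void reduce_tile_into_global(cuda::std::uint32_t* __restrict__ gmem_acc)
{
  // Shared-memory staging buffer; addresses must be 16-byte aligned and the size a multiple of 16.
  __shared__ alignas(16) cuda::std::uint32_t smem_tile[256];

  // ... all threads of the block fill smem_tile ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .add.u32 reduction of the shared tile into global memory.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      gmem_acc, smem_tile, sizeof(smem_tile));

    // The copy is asynchronous: commit the bulk-async group and wait until the source may be reused.
    cuda::ptx::cp_async_bulk_commit_group();
    cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>{});
  }
}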
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - uint64_t* dstMem, - const uint64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::uint64_t* __dstMem, - const _CUDA_VSTD::uint64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f32 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - float* dstMem, - const float* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
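// A similar sketch for the floating-point overloads: the f32 and f64 entry points shown here
// accept only cuda::ptx::op_add. Kernel and buffer names are illustrative assumptions.
#include <cuda/ptx>

__global__ void accumulate_partials(float* __restrict__ gmem_partials)
{
  __shared__ alignas(16) float smem_partials[128];

  // ... the block computes its partial sums into smem_partials ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .add.f32 reduction into the global accumulator.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      gmem_partials, smem_partials, sizeof(smem_partials));
    cuda::ptx::cp_async_bulk_commit_group();
  }
}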
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - double* dstMem, - const double* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .s64 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - int64_t* dstMem, - const int64_t* srcMem, - uint32_t size); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - _CUDA_VSTD::int64_t* __dstMem, - const _CUDA_VSTD::int64_t* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - +#include #ifdef _LIBCUDACXX_HAS_NVF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
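// A sketch of the __half overloads guarded by _LIBCUDACXX_HAS_NVF16 below (the __nv_bfloat16
// overloads later in this header follow the same pattern). Hypothetical kernel; for f16 only
// min, max and the .noftz add form are provided.
#include <cuda/ptx>
#include <cuda_fp16.h>

__global__ void min_reduce_halves(__half* __restrict__ gmem_min)
{
  __shared__ alignas(16) __half smem_vals[128];

  // ... the block writes candidate values into smem_vals ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    // Element-wise .min.f16 reduction of the shared values into global memory.
    cuda::ptx::cp_reduce_async_bulk(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min,
      gmem_min, smem_vals, sizeof(smem_vals));
    cuda::ptx::cp_async_bulk_commit_group();
  }
}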
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .f16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __half* dstMem, - const __half* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVF16 - #ifdef _LIBCUDACXX_HAS_NVBF16 -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .min } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_min_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_min_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .max } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_max_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_max_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .type = { .bf16 } -// .op = { .add } -template -__device__ static inline void cp_reduce_async_bulk( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_add_t, - __nv_bfloat16* dstMem, - const __nv_bfloat16* srcMem, - uint32_t size); -*/ -# if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk( - space_global_t, - space_shared_t, - op_add_t, - __nv_bfloat16* __dstMem, - const __nv_bfloat16* __srcMem, - _CUDA_VSTD::uint32_t __size) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); -} -# endif // __cccl_ptx_isa >= 800 +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index 4ecb108a719..a6b23a706c7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,538 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. 
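// The cp_reduce_async_bulk_tensor overloads that follow take a tensor map and a per-dimension
// coordinate array instead of a raw destination pointer. A usage sketch, assuming a CUtensorMap
// created on the host (for example with cuTensorMapEncodeTiled) and passed as a __grid_constant__
// kernel parameter; all names, sizes and the 2-D shape are illustrative assumptions.
#include <cuda.h>
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void reduce_tile_via_tensor_map(const __grid_constant__ CUtensorMap tensor_map)
{
  __shared__ alignas(128) float smem_tile[32][32];

  // ... the block fills smem_tile ...
  __syncthreads();

  if (threadIdx.x == 0)
  {
    const cuda::std::int32_t coords[2] = {0, 0}; // tile origin inside the global tensor
    // Element-wise add of the shared tile into the region of the tensor selected by coords.
    cuda::ptx::cp_reduce_async_bulk_tensor(
      cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_add,
      &tensor_map, coords, smem_tile);
    cuda::ptx::cp_async_bulk_commit_group();
  }
}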
Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -/* -// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[1], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[1], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[2], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[2], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[3], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[3], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[4], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[4], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, -SM_90 -// .dst = { .global } -// .src = { .shared::cta } -// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } -template -__device__ static inline void cp_reduce_async_bulk_tensor( - cuda::ptx::space_global_t, - cuda::ptx::space_shared_t, - cuda::ptx::op_t op, - const void* tensorMap, - const int32_t (&tensorCoords)[5], - const void* srcMem); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( - space_global_t, - space_shared_t, - op_t<_Op> __op, - const void* __tensorMap, - const _CUDA_VSTD::int32_t (&__tensorCoords)[5], - const void* __srcMem) -{ - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec - || __op == op_and_op || __op == op_or_op || __op == op_xor_op, - ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 956f86c910e..045f09cb40e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,253 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -/* -// fence{.sem}.scope; // 1. 
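// A minimal sketch of the fence wrappers declared in this header; the sem_* and scope_*
// arguments are the tag objects from cuda::ptx, and the surrounding device function is an
// illustrative assumption.
#include <cuda/ptx>

__device__ void fence_examples()
{
  // Sequentially consistent fence at CTA scope (fence.sc.cta).
  cuda::ptx::fence(cuda::ptx::sem_sc, cuda::ptx::scope_cta);

  // Acquire-release fence visible GPU-wide (fence.acq_rel.gpu).
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu);

  // Cluster-scope fence, SM_90 and PTX ISA 7.8 or newer (fence.acq_rel.cluster).
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_cluster);
}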
PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } -// .scope = { .cta, .gpu, .sys } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 600 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 600 - -/* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } -// .scope = { .cluster } -template -__device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) -{ - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 780 -/* -// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -template -__device__ static inline void fence_mbarrier_init( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.alias; // 4. 
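// A sketch of the mbarrier and proxy fences in this header, following the signatures documented
// here. The addr argument of the acquire fence is assumed to point at a tensormap in global
// memory; the device function and parameter names are illustrative assumptions.
#include <cuda/ptx>

__device__ void proxy_fence_examples(const void* gmem_tensor_map)
{
  // Make an mbarrier_init in shared memory visible across the cluster before other blocks use it.
  cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);

  // Order generic-proxy writes to shared memory before later async-proxy accesses.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);

  // Publish a tensormap written through the generic proxy, then acquire it before use with
  // bulk tensor instructions (a CUtensorMap is 128 bytes).
  cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
  cuda::ptx::fence_proxy_tensormap_generic(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, gmem_tensor_map, cuda::ptx::n32_t<128>{});
}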
PTX ISA 75, SM_70 -template -__device__ static inline void fence_proxy_alias(); -*/ -#if __cccl_ptx_isa >= 750 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence_proxy_alias() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); -} -#endif // __cccl_ptx_isa >= 750 -/* -// fence.proxy.async; // 5. PTX ISA 80, SM_90 -template -__device__ static inline void fence_proxy_async(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 -// .space = { .global, .shared::cluster, .shared::cta } -template -__device__ static inline void fence_proxy_async( - cuda::ptx::space_t space); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) -{ - static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void fence_proxy_tensormap_generic( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - const void* addr, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
- : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc new file mode 100644 index 00000000000..ca9238bc3ff --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc @@ -0,0 +1,123 @@ +/* +// barrier.cluster.arrive; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait; // PTX ISA 78, SM_90 +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .release } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) +{ + // __sem == sem_release (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.release;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) +{ + // __sem == sem_relaxed (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.arrive.relaxed;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) +{ + // __sem == sem_acquire (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("barrier.cluster.wait.acquire;" + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc new file mode 100644 index 00000000000..69f77053b95 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc @@ -0,0 +1,111 @@ +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, +SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
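// A sketch combining the cluster barrier wrappers above with the bulk copy into shared memory.
// The kernel is assumed to be launched as a thread-block cluster; mbarrier_init and the
// mbarrier arrive/wait helpers referenced in comments are other cuda::ptx wrappers and are
// assumptions of this example, as are all names and sizes.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void cluster_copy(const float* gmem_src)
{
  __shared__ alignas(16) float smem_buf[256];
  __shared__ cuda::std::uint64_t smem_bar;

  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&smem_bar, 1);
    cuda::ptx::fence_mbarrier_init(cuda::ptx::sem_release, cuda::ptx::scope_cluster);
  }

  // Cluster-wide rendezvous with release/acquire ordering before issuing cluster-scope traffic.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);

  if (threadIdx.x == 0)
  {
    // Bulk copy from global memory into this block's shared memory; completion is signalled
    // through smem_bar (the "1a. unicast" form above).
    cuda::ptx::cp_async_bulk(
      cuda::ptx::space_cluster, cuda::ptx::space_global,
      smem_buf, gmem_src, sizeof(smem_buf), &smem_bar);
  }
  // ... a full example would also arrive on smem_bar with the expected transaction count
  // (cuda::ptx::mbarrier_arrive_expect_tx) and wait on it (e.g. cuda::ptx::mbarrier_try_wait_parity)
  // before reading smem_buf ...
}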
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc new file mode 100644 index 00000000000..24baddaea8f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc @@ -0,0 +1,21 @@ +/* +// cp.async.bulk.commit_group; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_commit_group(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_commit_group() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.commit_group;" + : + : + :);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc new file mode 100644 index 00000000000..cdd5a535eb6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], +ctaMask; // 1. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " + "%4; // 1. 
" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc new file mode 100644 index 00000000000..547888d5b0f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc @@ -0,0 +1,416 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1a. PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " + "1a." + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1b. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " + "[%4];// 1b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1c. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " + "[%5];// 1c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1d. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5}], [%6];// 1d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// +1e. 
PTX ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];// 1e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc new file mode 100644 index 00000000000..020698a15b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc @@ -0,0 +1,239 @@ +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2}], [%3], %4; // 2a." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3}], [%4], %5; // 2b." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4}], [%5], %6; // 2c." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " + "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc new file mode 100644 index 00000000000..1a715a0fac6 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc @@ -0,0 +1,45 @@ +/* +// cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.wait_group.read N; // PTX ISA 80, SM_90 +template +__device__ static inline void cp_async_bulk_wait_group_read( + cuda::ptx::n32_t N); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("cp.async.bulk.wait_group.read %0;" + : + : "n"(__n.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc new file mode 100644 index 00000000000..50059ff6c5b --- /dev/null +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc @@ -0,0 +1,1435 @@ +// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_and_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_or_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + B32* dstMem, + const B32* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_xor_op_t, + _B32* __dstMem, + const _B32* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " + "// 1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.u64 [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX +ISA 80, SM_90 +// .dst = { .shared::cluster } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_cluster_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size, + uint64_t* rdsmem_bar); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_cluster_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size, + _CUDA_VSTD::uint64_t* __rdsmem_bar) +{ + // __space == space_cluster (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " + "// 2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .and } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_and_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_and_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_and_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .or } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_or_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_or_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_or_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .b32, .b64 } +// .op = { .xor } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_xor_op_t, + Type* dstMem, + const Type* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_xor_op_t, _Type* __dstMem, const _Type* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + // __op == op_xor_op (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." 
+ : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_inc_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_inc_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_dec_t, + uint32_t* dstMem, + const uint32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_dec_t, + _CUDA_VSTD::uint32_t* __dstMem, + const _CUDA_VSTD::uint32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int32_t* dstMem, + const int32_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int32_t* __dstMem, + const _CUDA_VSTD::int32_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + uint64_t* dstMem, + const uint64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::uint64_t* __dstMem, + const _CUDA_VSTD::uint64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f32 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + float* dstMem, + const float* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + double* dstMem, + const double* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.u64 [dstMem], [srcMem], size; // 6. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .s64 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + int64_t* dstMem, + const int64_t* srcMem, + uint32_t size); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + _CUDA_VSTD::int64_t* __dstMem, + const _CUDA_VSTD::int64_t* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_s64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc new file mode 100644 index 00000000000..c657e8d1935 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc @@ -0,0 +1,127 @@ +#ifdef _LIBCUDACXX_HAS_NVBF16 +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_min_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_max_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .bf16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __nv_bfloat16* dstMem, + const __nv_bfloat16* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, + space_shared_t, + op_add_t, + __nv_bfloat16* __dstMem, + const __nv_bfloat16* __srcMem, + _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_bf16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 +#endif // _LIBCUDACXX_HAS_NVBF16 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc new file mode 100644 index 00000000000..3a52630db53 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc @@ -0,0 +1,110 @@ +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .min } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_min_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .max } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_max_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .type = { .f16 } +// .op = { .add } +template +__device__ static inline void cp_reduce_async_bulk( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_add_t, + __half* dstMem, + const __half* srcMem, + uint32_t size); +*/ +# if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk( + space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + // __type == type_f16 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +} +# endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc new file mode 100644 index 00000000000..32008f6af5b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc @@ -0,0 +1,532 @@ +/* +// cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.2d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1b. 
PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.3d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1c. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.4d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1d. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.reduce.async.bulk.tensor.5d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1e. PTX ISA 80, +SM_90 +// .dst = { .global } +// .src = { .shared::cta } +// .op = { .add, .min, .max, .inc, .dec, .and, .or, .xor } +template +__device__ static inline void cp_reduce_async_bulk_tensor( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + cuda::ptx::op_t op, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( + space_global_t, + space_shared_t, + op_t<_Op> __op, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ + // __space == space_global (due to parameter type constraint) + // __space == space_shared (due to parameter type constraint) + static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec + || __op == op_and_op || __op == op_or_op || __op == op_xor_op, + ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__op == op_add) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " + "1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " + "// 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc new file mode 100644 index 00000000000..f10ec07ebb5 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc @@ -0,0 +1,67 @@ +/* +// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc, .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + asm volatile("fence.sc.gpu; // 1." 
: : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + asm volatile("fence.sc.sys; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc, .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_t sem, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +{ + static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__sem == sem_sc) { + asm volatile("fence.sc.cluster; // 2." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc new file mode 100644 index 00000000000..0d39c222598 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc @@ -0,0 +1,27 @@ +/* +// fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +template +__device__ static inline void fence_mbarrier_init( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.mbarrier_init.release.cluster; // 3." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc new file mode 100644 index 00000000000..98260b851ca --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc @@ -0,0 +1,21 @@ +/* +// fence.proxy.alias; // 4. 
PTX ISA 75, SM_70 +template +__device__ static inline void fence_proxy_alias(); +*/ +#if __cccl_ptx_isa >= 750 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence_proxy_alias() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_70, + (asm volatile("fence.proxy.alias; // 4." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +} +#endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc new file mode 100644 index 00000000000..f0a37baabdb --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc @@ -0,0 +1,50 @@ +/* +// fence.proxy.async; // 5. PTX ISA 80, SM_90 +template +__device__ static inline void fence_proxy_async(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm volatile("fence.proxy.async; // 5." + : + : + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// .space = { .global, .shared::cluster, .shared::cta } +template +__device__ static inline void fence_proxy_async( + cuda::ptx::space_t space); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) +{ + static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__space == space_global) { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc new file mode 100644 index 00000000000..3e5b2a265f4 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc @@ -0,0 +1,82 @@ +/* +// fence.proxy.tensormap::generic.release.scope; // 7. 
PTX ISA 83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// fence.proxy.tensormap::generic.sem.scope [addr], size; // 8. PTX ISA 83, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence_proxy_tensormap_generic( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + const void* addr, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void* __addr, n32_t<_N32> __size) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc new file mode 100644 index 00000000000..dd3079915f7 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc @@ -0,0 +1,1001 @@ +/* +// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_tid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ntid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%laneid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_laneid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%warpid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_warpid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, 
%%warpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nwarpid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nwarpid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 +template +__device__ static inline uint32_t get_sreg_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 200 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%smid; // PTX ISA 13 +template +__device__ static inline uint32_t get_sreg_smid(); +*/ +#if __cccl_ptx_isa >= 130 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 130 + +/* +// mov.u32 sreg_value, 
%%nsmid; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_nsmid(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%nsmid;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%gridid; // PTX ISA 30 +template +__device__ static inline uint64_t get_sreg_gridid(); +*/ +#if __cccl_ptx_isa >= 300 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() +{ + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 300 + +/* +// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 +template +__device__ static inline bool get_sreg_is_explicit_cluster(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_clusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern 
"C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_nclusterid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 
sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" + : "=r"(__sreg_value) + : + :); + return 
__sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_ctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 +template +__device__ static inline uint32_t get_sreg_cluster_nctarank(); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_eq(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_le(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_lt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_lanemask_lt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_ge(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 +template +__device__ static inline uint32_t get_sreg_lanemask_gt(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u32 sreg_value, %%clock; // PTX ISA 10 +template +__device__ static inline uint32_t get_sreg_clock(); +*/ +#if __cccl_ptx_isa >= 100 +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() +{ + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); + return __sreg_value; +} +#endif // __cccl_ptx_isa >= 100 + +/* +// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 +template +__device__ static inline uint32_t get_sreg_clock_hi(); +*/ +#if __cccl_ptx_isa >= 500 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%clock_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 500 + +/* +// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 +template +__device__ static inline uint64_t get_sreg_clock64(); +*/ +#if __cccl_ptx_isa >= 200 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%clock64;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker 
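The lanemask and clock registers are commonly used for warp-level rank computations and coarse timing. A sketch under those assumptions; the helper names are made up for illustration:

#include <cuda/ptx>
#include <cuda/std/cstdint>

// Rank of this lane among the active lanes of its warp: %lanemask_lt has a bit
// set for every lane id smaller than ours, so masking and popcounting gives the
// number of active lanes in front of us.
__device__ cuda::std::uint32_t rank_among_active(cuda::std::uint32_t active_mask)
{
  return __popc(active_mask & cuda::ptx::get_sreg_lanemask_lt());
}

// %clock is a per-SM 32-bit cycle counter; good enough for coarse intra-kernel timing.
__device__ cuda::std::uint32_t cycle_stamp()
{
  return cuda::ptx::get_sreg_clock();
}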
error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 200 + +/* +// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 +template +__device__ static inline uint64_t get_sreg_globaltimer(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( + "mov.u64 %0, %%globaltimer;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_lo(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_lo;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 +template +__device__ static inline uint32_t get_sreg_globaltimer_hi(); +*/ +#if __cccl_ptx_isa >= 310 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( + "mov.u32 %0, %%globaltimer_hi;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 310 + +/* +// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_total_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 +template +__device__ static inline uint32_t get_sreg_aggr_smem_size(); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() +{ + NV_IF_ELSE_TARGET( + 
NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 +template +__device__ static inline uint32_t get_sreg_dynamic_smem_size(); +*/ +#if __cccl_ptx_isa >= 410 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_35, + (_CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" + : "=r"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 410 + +/* +// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 +template +__device__ static inline uint64_t get_sreg_current_graph_exec(); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_50, + (_CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" + : "=l"(__sreg_value) + : + :); + return __sreg_value;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc new file mode 100644 index 00000000000..51bd351be87 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc @@ -0,0 +1,27 @@ +/* +// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) +{ + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" + : "=r"(__dest) + : "r"(__as_ptr_smem(__addr)) + :); + return __dest;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc new file mode 100644 index 00000000000..f3e2b860d50 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc @@ -0,0 +1,205 @@ +/* +// mbarrier.arrive.shared.b64 state, [addr]; // 1. 
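getctarank maps an address in the shared::cluster window to the rank of the CTA that owns it. A sketch, assuming a cluster launch; the cooperative_groups call is used only to obtain a cluster-mapped pointer and is an assumption outside this patch:

#include <cooperative_groups.h>
#include <cuda/ptx>

namespace cg = cooperative_groups;

__global__ void __cluster_dims__(2, 1, 1) getctarank_demo(unsigned* out)
{
  __shared__ int buf[32];

  // Map our shared buffer into the address window of CTA rank 0 of the cluster,
  // then ask which CTA owns that address; by construction the answer is 0.
  int* peer            = cg::this_cluster().map_shared_rank(buf, 0);
  const unsigned owner = cuda::ptx::getctarank(cuda::ptx::space_cluster, peer);

  if (cuda::ptx::get_sreg_tid_x() == 0 && cuda::ptx::get_sreg_cluster_ctarank() == 0) {
    out[0] = owner; // expected: 0
  }
}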
PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 +template +__device__ static inline uint64_t mbarrier_arrive( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. 
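A sketch of the basic arrive path on an mbarrier that lives in CTA shared memory. Initialization uses mbarrier_init, which is added further down in this patch, and the wait half of the protocol is shown in a later sketch; the kernel itself is illustrative:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void mbarrier_arrive_demo()
{
  namespace ptx = cuda::ptx;
  __shared__ cuda::std::uint64_t bar;

  if (ptx::get_sreg_tid_x() == 0) {
    // Expect one arrival from every thread of the block.
    ptx::mbarrier_init(&bar, ptx::get_sreg_ntid_x());
  }
  __syncthreads();

  // Form 1 (sm_80 and newer): arrive once and receive an opaque phase token.
  cuda::std::uint64_t token = ptx::mbarrier_arrive(&bar);
  (void) token; // a real kernel would hand this to one of the *_wait functions
}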
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. 
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc new file mode 100644 index 00000000000..efb749957b1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc @@ -0,0 +1,79 @@ +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + sem_release_t, + scope_t<_Scope> __scope, + space_shared_t, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. 
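mbarrier_arrive_expect_tx combines an arrival with registering how many bytes of asynchronous transactions the current phase should still observe (as used with bulk asynchronous copies). A minimal sketch; the helper name and the assumption that the mbarrier was initialized elsewhere are illustrative:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ cuda::std::uint64_t
post_expected_bytes(cuda::std::uint64_t* bar, cuda::std::uint32_t bytes)
{
  // Arrive once with release semantics at CTA scope and add `bytes` to the
  // expected transaction count of the current phase.
  return cuda::ptx::mbarrier_arrive_expect_tx(
    cuda::ptx::sem_release, cuda::ptx::scope_cta, cuda::ptx::space_shared, bar, bytes);
}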
PTX ISA 80, SM_90 +// .sem = { .release } +// .scope = { .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::sem_release_t, + cuda::ptx::scope_cluster_t, + cuda::ptx::space_cluster_t, + uint64_t* addr, + const uint32_t& tx_count); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) +{ + // __sem == sem_release (due to parameter type constraint) + // __scope == scope_cluster (due to parameter type constraint) + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc new file mode 100644 index 00000000000..879bedebdc9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc @@ -0,0 +1,26 @@ +/* +// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 +template +__device__ static inline uint64_t mbarrier_arrive_no_complete( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state;), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc new file mode 100644 index 00000000000..3afeeacfccf --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc @@ -0,0 +1,23 @@ +/* +// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 +template +__device__ static inline void mbarrier_init( + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (asm("mbarrier.init.shared.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +} +#endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc new file mode 100644 index 00000000000..301c0364af4 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc @@ -0,0 +1,75 @@ +/* +// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX +ISA 70, SM_80 template +__device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX +ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " + "\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc new file mode 100644 index 00000000000..604cfd92045 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc @@ -0,0 +1,75 @@ +/* +// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX +ISA 71, SM_80 template +__device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 710 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline bool +mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_80, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +} +#endif // __cccl_ptx_isa >= 710 + +/* +// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX +ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc new file mode 100644 index 00000000000..c5f2062664c --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
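The parity variants wait on an explicit phase-parity bit instead of a token, which the caller tracks and flips after every completed phase. A sketch under that assumption:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void wait_on_parity(cuda::std::uint64_t* bar, cuda::std::uint32_t& parity)
{
  // Poll until the phase with the given parity (0 or 1) has completed.
  while (!cuda::ptx::mbarrier_test_wait_parity(bar, parity)) {
  }
  parity ^= 1; // the next wait targets the other phase
}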
PTX +ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " + "\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. 
+PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc new file mode 100644 index 00000000000..321bfc515da --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc @@ -0,0 +1,157 @@ +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. +PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool +mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
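try_wait differs from test_wait in that it may suspend the calling thread for a bounded, implementation-defined time before returning, so polling is cheaper. A sketch of the acquire-ordered, CTA-scoped overload; the overload taking an additional suspendTimeHint works the same way:

#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void wait_acquire_cta(cuda::std::uint64_t* bar, cuda::std::uint64_t token)
{
  namespace ptx = cuda::ptx;
  // Loop until the phase identified by `token` completes, with acquire
  // ordering at CTA scope on success.
  while (!ptx::mbarrier_try_wait(ptx::sem_acquire, ptx::scope_cta, bar, token)) {
  }
}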
+PTX ISA 78, SM_90 template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (_CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. +PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
+PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } return static_cast(__waitComplete);), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +} +#endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc new file mode 100644 index 00000000000..3157fa1c627 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc @@ -0,0 +1,417 @@ +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + 
uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + int32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int32_t* dest, + 
const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + 
B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // +PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void red_async( + op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc new file mode 100644 index 00000000000..9dfab243ffe --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc @@ -0,0 +1,108 @@ 
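// ---------------------------------------------------------------------------
// Illustrative usage sketch, not part of the generated files above: driving the
// red_async wrappers from red_async.inc. It assumes an SM_90 build, a cluster
// launch, and that cooperative_groups::cluster_group::map_shared_rank (CUDA 12+)
// is used to obtain the peer CTA's shared-memory pointers; those choices are
// assumptions of this example, not prescribed by this patch.
#include <cooperative_groups.h>

#include <cuda/ptx>
#include <cuda/std/cstdint>

namespace cg = cooperative_groups;

__device__ void
bump_peer_counter(cuda::std::uint32_t* __counter_smem, cuda::std::uint64_t* __bar_smem, unsigned __peer_rank)
{
  cg::cluster_group __cluster = cg::this_cluster();

  // Translate our own shared-memory addresses into the peer CTA's window of the
  // cluster's distributed shared memory.
  cuda::std::uint32_t* __remote_counter = __cluster.map_shared_rank(__counter_smem, __peer_rank);
  cuda::std::uint64_t* __remote_bar     = __cluster.map_shared_rank(__bar_smem, __peer_rank);

  // Asynchronous .add.u32 reduction into the remote counter; completion is
  // signalled through the remote mbarrier rather than returned to the caller.
  cuda::ptx::red_async(cuda::ptx::op_add, __remote_counter, 1u, __remote_bar);
}
// ---------------------------------------------------------------------------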
+/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, +SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + Type* addr, + const Type& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, +SM_90 +// .type = { .b32, .b64 } +template +__device__ static inline void st_async( + Type* addr, + const Type (&value)[2], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, +SM_90 template +__device__ static inline void st_async( + B32* addr, + const B32 (&value)[4], + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) +{ + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc new file mode 100644 index 00000000000..033d0606e7f --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc @@ -0,0 +1,54 @@ +/* +// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA +83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void tensormap_cp_fenceproxy( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + void* dst, + const void* src, + cuda::ptx::n32_t size); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_90, + ( + _CCCL_IF_CONSTEXPR (__scope == scope_cta) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + }), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc new file mode 100644 index 00000000000..3b1060ead38 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc @@ -0,0 +1,569 @@ +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); 
+*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + 
(asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], 
ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET( + NV_HAS_FEATURE_SM_90a, + (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory");), + ( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +} +#endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 8982984885d..033005beb5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ 
b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,1007 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -/* -// mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_tid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%tid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ntid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%ntid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_laneid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%laneid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_warpid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%warpid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nwarpid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); 
-template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nwarpid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%ctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.x;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.y;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template -__device__ static inline uint32_t get_sreg_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 200 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nctaid.z;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%smid; // PTX ISA 13 -template -__device__ static inline uint32_t get_sreg_smid(); -*/ -#if __cccl_ptx_isa >= 130 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%smid;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 130 - -/* -// mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_nsmid(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nsmid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template -__device__ static inline uint64_t get_sreg_gridid(); -*/ -#if __cccl_ptx_isa >= 300 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() -{ - _CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%gridid;" : "=l"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 300 - -/* -// mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template -__device__ static inline bool get_sreg_is_explicit_cluster(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__sreg_value) - : - :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_clusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.z;" - : "=r"(__sreg_value) - : - :); - return 
__sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_nclusterid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t 
get_sreg_cluster_ctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_ctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_x(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_y(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctaid_z(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template 
-__device__ static inline uint32_t get_sreg_cluster_ctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template -__device__ static inline uint32_t get_sreg_cluster_nctarank(); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_eq(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_eq;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_le(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_le;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_lt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_lt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_ge(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_ge;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template -__device__ static inline uint32_t get_sreg_lanemask_gt(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_gt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u32 sreg_value, %%clock; // PTX ISA 10 -template -__device__ static inline uint32_t get_sreg_clock(); -*/ -#if __cccl_ptx_isa >= 100 -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() -{ - _CUDA_VSTD::uint32_t __sreg_value; - asm volatile("mov.u32 %0, %%clock;" : "=r"(__sreg_value) : :); - return __sreg_value; -} -#endif // __cccl_ptx_isa >= 100 - -/* -// mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template -__device__ static inline uint32_t get_sreg_clock_hi(); -*/ -#if __cccl_ptx_isa >= 500 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%clock_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 500 - -/* -// mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template -__device__ static inline uint64_t get_sreg_clock64(); -*/ -#if __cccl_ptx_isa >= 200 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%clock64;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 200 - -/* -// mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template -__device__ static inline uint64_t get_sreg_globaltimer(); -*/ -#if 
__cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%globaltimer;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_lo(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_lo;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template -__device__ static inline uint32_t get_sreg_globaltimer_hi(); -*/ -#if __cccl_ptx_isa >= 310 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 310 - -/* -// mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template -__device__ static inline uint32_t get_sreg_total_smem_size(); -*/ -#if __cccl_ptx_isa >= 410 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%total_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 410 - -/* -// mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template -__device__ static inline uint32_t get_sreg_aggr_smem_size(); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%aggr_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template -__device__ static inline uint32_t get_sreg_dynamic_smem_size(); -*/ -#if __cccl_ptx_isa >= 410 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%dynamic_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 410 - -/* -// mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template -__device__ static inline uint64_t get_sreg_current_graph_exec(); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_50, - (_CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%current_graph_exec;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f1a2bbbd0e9..f5ed3424d3b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,33 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. 
Data Movement and Conversion Instructions: getctarank
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
-/*
-// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90
-// .space = { .shared::cluster }
-template <typename = void>
-__device__ static inline uint32_t getctarank(
-  cuda::ptx::space_cluster_t,
-  const void* addr);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr)
-{
-  // __space == space_cluster (due to parameter type constraint)
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (_CUDA_VSTD::uint32_t __dest;
-     asm("getctarank.shared::cluster.u32 %0, %1;"
-         : "=r"(__dest)
-         : "r"(__as_ptr_smem(__addr))
-         :);
-     return __dest;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 780
+#include
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h
index 5b423990f1c..fb1341a61d8 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h
@@ -32,316 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
-/*
-// mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80
-template <typename = void>
-__device__ static inline uint64_t mbarrier_arrive(
-  uint64_t* addr);
-*/
-#if __cccl_ptx_isa >= 700
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_80,
-    (_CUDA_VSTD::uint64_t __state;
-     asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. "
-         : "=l"(__state)
-         : "r"(__as_ptr_smem(__addr))
-         : "memory");
-     return __state;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 700
-
-/*
-// mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90
-template <typename = void>
-__device__ static inline uint64_t mbarrier_arrive(
-  uint64_t* addr,
-  const uint32_t& count);
-*/
-#if __cccl_ptx_isa >= 780
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();
-template <typename = void>
-_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t
-mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (_CUDA_VSTD::uint64_t __state;
-     asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. "
-         : "=l"(__state)
-         : "r"(__as_ptr_smem(__addr)), "r"(__count)
-         : "memory");
-     return __state;),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;));
-}
-#endif // __cccl_ptx_isa >= 780
-
-/*
-// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a.
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_remote_dsmem(__addr)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template -__device__ static inline uint64_t mbarrier_arrive_no_complete( - uint64_t* addr, - const uint32_t& count); -*/ -#if __cccl_ptx_isa >= 700 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t -mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 700 -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster } -// .space = { .shared::cta } -template -__device__ static inline uint64_t mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - cuda::ptx::space_shared_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( - sem_release_t, - scope_t<_Scope> __scope, - space_shared_t, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 -// .sem = { .release } -// .scope = { .cluster } -// .space = { .shared::cluster } -template -__device__ static inline void mbarrier_arrive_expect_tx( - cuda::ptx::sem_release_t, - cuda::ptx::scope_cluster_t, - cuda::ptx::space_cluster_t, - uint64_t* addr, - const uint32_t& tx_count); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( - sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) -{ - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 366b1b67eec..575abda7a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,29 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. 
Parallel Synchronization and Communication Instructions: mbarrier.init
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init
-/*
-// mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80
-template <typename = void>
-__device__ static inline void mbarrier_init(
-  uint64_t* addr,
-  const uint32_t& count);
-*/
-#if __cccl_ptx_isa >= 700
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();
-template <typename = void>
-_CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_80,
-    (asm("mbarrier.init.shared.b64 [%0], %1;"
-         :
-         : "r"(__as_ptr_smem(__addr)), "r"(__count)
-         : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();));
-}
-#endif // __cccl_ptx_isa >= 700
+#include
 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX
diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h
index 837fec44b9f..2d6adb78eec 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h
@@ -32,470 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait
-/*
-// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80
-template <typename = void>
-__device__ static inline bool mbarrier_test_wait(
-  uint64_t* addr,
-  const uint64_t& state);
-*/
-#if __cccl_ptx_isa >= 700
-extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__();
-template <typename = void>
-_CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state)
-{
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_80,
-    (_CUDA_VSTD::uint32_t __waitComplete;
-     asm("{\n\t .reg .pred P_OUT; \n\t"
-         "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t"
-         "selp.b32 %0, 1, 0, P_OUT; \n"
-         "}"
-         : "=r"(__waitComplete)
-         : "r"(__as_ptr_smem(__addr)), "l"(__state)
-         : "memory");
-     return static_cast<bool>(__waitComplete);),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;));
-}
-#endif // __cccl_ptx_isa >= 700
-
-/*
-// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2.
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template -__device__ static inline bool mbarrier_test_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 710 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); -template -_CCCL_DEVICE static inline bool -mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); -} -#endif // __cccl_ptx_isa >= 710 - -/* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX -ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_test_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_test_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX -ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait( - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint64_t& state, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint64_t& __state, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool -mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template -__device__ static inline bool mbarrier_try_wait_parity( - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 780 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 780 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
-PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 - -/* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 -// .sem = { .acquire } -// .scope = { .cta, .cluster } -template -__device__ static inline bool mbarrier_try_wait_parity( - cuda::ptx::sem_acquire_t, - cuda::ptx::scope_t scope, - uint64_t* addr, - const uint32_t& phaseParity, - const uint32_t& suspendTimeHint); -*/ -#if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline bool mbarrier_try_wait_parity( - sem_acquire_t, - scope_t<_Scope> __scope, - _CUDA_VSTD::uint64_t* __addr, - const _CUDA_VSTD::uint32_t& __phaseParity, - const _CUDA_VSTD::uint32_t& __suspendTimeHint) -{ - // __sem == sem_acquire (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); -} -#endif // __cccl_ptx_isa >= 800 +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index 777628c67d0..a610cf2b583 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,423 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .inc } -template -__device__ static inline void red_async( - cuda::ptx::op_inc_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .dec } -template -__device__ static inline void red_async( - cuda::ptx::op_dec_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, 
SM_90 -// .type = { .u32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .u32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint32_t* dest, - const uint32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 
-// .type = { .s32 } -// .op = { .min } -template -__device__ static inline void red_async( - cuda::ptx::op_min_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .max } -template -__device__ static inline void red_async( - cuda::ptx::op_max_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .s32 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int32_t* dest, - const int32_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } 
-// .op = { .and } -template -__device__ static inline void red_async( - cuda::ptx::op_and_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .or } -template -__device__ static inline void red_async( - cuda::ptx::op_or_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 -// .type = { .b32 } -// .op = { .xor } -template -__device__ static inline void red_async( - cuda::ptx::op_xor_op_t, - B32* dest, - const B32& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX 
ISA 81, SM_90 -// .type = { .u64 } -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - uint64_t* dest, - const uint64_t& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void red_async( - op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 -intentional PTX ISA 81, SM_90 -// .op = { .add } -template -__device__ static inline void red_async( - cuda::ptx::op_add_t, - int64_t* dest, - const int64_t& value, - int64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) -{ - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index e6774087802..09199b4a3ce 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,114 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type& value, - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, -SM_90 -// .type = { .b32, .b64 } -template -__device__ static inline void st_async( - Type* addr, - const Type (&value)[2], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 - -/* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template -__device__ static inline void st_async( - B32* addr, - const B32 (&value)[4], - uint64_t* remote_bar); -*/ -#if __cccl_ptx_isa >= 810 -extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) -{ - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 810 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index ce8b0f10991..de179f69735 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,60 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -/* -// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA -83, SM_90 -// .sem = { .release } -// .scope = { .cta, .cluster, .gpu, .sys } -template -__device__ static inline void tensormap_cp_fenceproxy( - cuda::ptx::sem_release_t, - cuda::ptx::scope_t scope, - void* dst, - const void* src, - cuda::ptx::n32_t size); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void -tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, const void* __src, n32_t<_N32> __size) -{ - // __sem == sem_release (due to parameter type constraint) - static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index b40c0cf72aa..2f81d8b4361 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,575 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_global_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_address( - cuda::ptx::space_shared_t, - void* tm_addr, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_global_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_rank( - cuda::ptx::space_shared_t, - void* tm_addr, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_box_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void 
__cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_dim( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_global_stride( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B64 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, 
_B64 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_global (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_element_size( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t ord, - B32 new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) -{ - // __space == space_shared (due to parameter type constraint) - static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" - : - : 
"l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_elemtype( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_interleave_layout( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void -tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static 
inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_swizzle_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .global } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_global_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 - -/* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a -// .space = { .shared::cta } -template -__device__ static inline void tensormap_replace_fill_mode( - cuda::ptx::space_shared_t, - void* tm_addr, - cuda::ptx::n32_t new_val); -*/ -#if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); -template -_CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) -{ - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - 
(asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); -} -#endif // __cccl_ptx_isa >= 830 +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX From 4ae70bbbfc2baa942fb499e8d719487544fa9e03 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:44:07 +0100 Subject: [PATCH 11/45] implement C++26 `std::span`'s constructor from `std::initializer_list` (#2923) Co-authored-by: Michael Schellenberger Costa --- .../cuda/std/detail/libcxx/include/span | 33 ++++++- libcudacxx/include/cuda/std/version | 5 +- .../views/views.span/span.cons/array.pass.cpp | 5 +- .../span.cons/initializer_list.pass.cpp | 86 ++++++++++++++++--- 4 files changed, 111 insertions(+), 18 deletions(-) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/span b/libcudacxx/include/cuda/std/detail/libcxx/include/span index 75774146c09..042d2f029c5 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/span +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/span @@ -172,6 +172,7 @@ template #include #include // for ptrdiff_t #include +#include // standard-mandated includes #include @@ -202,6 +203,12 @@ _CCCL_INLINE_VAR constexpr bool __is_std_span> = true; template _CCCL_CONCEPT __span_array_convertible = _CCCL_TRAIT(is_convertible, _From (*)[], _To (*)[]); +template +_CCCL_INLINE_VAR constexpr bool __is_std_initializer_list = false; + +template +_CCCL_INLINE_VAR constexpr bool __is_std_initializer_list> = true; + // We want to ensure that span interacts nicely with containers that might not have had the ranges treatment # if defined(__cpp_lib_ranges) && !_CCCL_COMPILER(MSVC2017) # define _CCCL_SPAN_USES_RANGES @@ -216,7 +223,8 @@ _CCCL_CONCEPT_FRAGMENT( requires(_CUDA_VRANGES::sized_range<_Range>), requires((_CUDA_VRANGES::borrowed_range<_Range> || _CCCL_TRAIT(is_const, _ElementType))), requires((!_CCCL_TRAIT(is_array, remove_cvref_t<_Range>))), - requires((!__is_std_span> && !__is_std_array>) ), + requires((!__is_std_span> && !__is_std_array> + && !__is_std_initializer_list>) ), requires(_CCCL_TRAIT( is_convertible, remove_reference_t<_CUDA_VRANGES::range_reference_t<_Range>> (*)[], _ElementType (*)[])))); @@ -259,11 +267,13 @@ _CCCL_INLINE_VAR constexpr bool __is_span_compatible_container< _ElementType, void_t< // is not a specialization of span - enable_if_t, nullptr_t>, + enable_if_t>, nullptr_t>, + // is not a specialization of array + enable_if_t>, nullptr_t>, // is not a specialization of array - enable_if_t, nullptr_t>, + enable_if_t>, nullptr_t>, // is_array_v is false, - enable_if_t, + enable_if_t), nullptr_t>, // data(cont) and size(cont) are well formed decltype(_CUDA_VSTD::data(_CUDA_VSTD::declval<_Container&>())), decltype(_CUDA_VSTD::size(_CUDA_VSTD::declval<_Container&>())), @@ -329,6 +339,14 @@ public: : __data_{nullptr} {} + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(_CCCL_TRAIT(is_const, _Tp2)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr explicit span(initializer_list __il) noexcept + : __data_{__il.begin()} + { + _CCCL_ASSERT(_Extent == __il.size(), "size mismatch in span's constructor (initializer_list)."); + } + _CCCL_HIDE_FROM_ABI span(const span&) noexcept = default; _CCCL_HIDE_FROM_ABI span& operator=(const span&) noexcept = default; @@ -585,6 +603,13 @@ public: , 
__size_{0} {} + _CCCL_TEMPLATE(class _Tp2 = _Tp) + _CCCL_REQUIRES(_CCCL_TRAIT(is_const, _Tp2)) + _LIBCUDACXX_HIDE_FROM_ABI constexpr span(initializer_list __il) noexcept + : __data_{__il.begin()} + , __size_{__il.size()} + {} + _CCCL_HIDE_FROM_ABI span(const span&) noexcept = default; _CCCL_HIDE_FROM_ABI span& operator=(const span&) noexcept = default; diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 238259f45ef..bb9475ede07 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -59,8 +59,9 @@ # ifndef _LIBCUDACXX_HAS_NO_THREADS // # define __cccl_lib_shared_timed_mutex 201402L # endif // !_LIBCUDACXX_HAS_NO_THREADS -# define __cccl_lib_source_location 201907L -# define __cccl_lib_span 202311L +# define __cccl_lib_source_location 201907L +# define __cccl_lib_span 202311L +# define __cccl_lib_span_initializer_list 202311L // # define __cccl_lib_string_udls 201304L # define __cccl_lib_transformation_trait_aliases 201304L # define __cccl_lib_transparent_operators 201210L diff --git a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp index b0cb864464b..5c819507038 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/array.pass.cpp @@ -18,6 +18,7 @@ // — remove_pointer_t(*)[] is convertible to ElementType(*)[]. // +#include #include #include @@ -92,8 +93,8 @@ __host__ __device__ constexpr bool testSpan() assert(s3.data() == val && s3.size() == 2); assert(s4.data() == val && s4.size() == 2); - cuda::std::span s5 = {{1, 2}}; - cuda::std::span s6 = {{1, 2}}; + cuda::std::span s5 = {cuda::std::array{1, 2}}; + cuda::std::span s6 = {cuda::std::array{1, 2}}; assert(s5.size() == 2); // and it dangles assert(s6.size() == 2); // and it dangles diff --git a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp index 3c2a2526455..d84d0b01115 100644 --- a/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/views/views.span/span.cons/initializer_list.pass.cpp @@ -1,20 +1,50 @@ //===----------------------------------------------------------------------===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
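A minimal usage sketch of what the new initializer_list constructor enables (not part of this patch; count_ints, count_two and example are illustrative names):

#include <cuda/std/cassert>
#include <cuda/std/cstddef>
#include <cuda/std/span>

__host__ __device__ constexpr cuda::std::size_t count_ints(cuda::std::span<const int> sp)
{
  return sp.size();
}

__host__ __device__ constexpr cuda::std::size_t count_two(cuda::std::span<const int, 2> sp)
{
  return sp.size();
}

__host__ __device__ void example()
{
  // For spans of const elements, a braced list is now treated as an
  // initializer_list whose backing array the span views for the duration of
  // the full-expression, so this call compiles and yields a three-element span.
  assert(count_ints({1, 2, 3}) == 3);

  // The fixed-extent form uses the same constructor, but there it is explicit
  // and the list size must match the extent.
  assert(count_two(cuda::std::span<const int, 2>{{4, 5}}) == 2);
}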
// //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11 -// +// UNSUPPORTED: c++11 +// + +// constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); + +// #include #include #include +#include #include +#include +#include "test_convertible.h" #include "test_macros.h" + +using cuda::std::is_constructible; + +// Constructor constrains +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); + +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(!is_constructible, cuda::std::initializer_list>::value, ""); + +// Constructor conditionally explicit + +static_assert(!test_convertible, cuda::std::initializer_list>(), + "This constructor must be explicit"); +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); +static_assert(test_convertible, cuda::std::initializer_list>(), + "This constructor must not be explicit"); +static_assert(is_constructible, cuda::std::initializer_list>::value, ""); + struct Sink { constexpr Sink() = default; @@ -26,25 +56,61 @@ __host__ __device__ constexpr cuda::std::size_t count(cuda::std::span -__host__ __device__ constexpr cuda::std::size_t countn(cuda::std::span sp) +template +__host__ __device__ constexpr cuda::std::size_t count_n(cuda::std::span sp) { return sp.size(); } __host__ __device__ constexpr bool test() { - Sink a[10] = {}; - assert(count({a}) == 10); - assert(count({a, a + 10}) == 10); - assert(countn<10>({a}) == 10); + // Dynamic extent + { + Sink a[10]{}; + + assert(count({a}) == 1); + assert(count({a, a + 10}) == 2); + assert(count({a, a + 1, a + 2}) == 3); + assert(count(cuda::std::initializer_list{a[0], a[1], a[2], a[3]}) == 4); + } + + return true; +} + +// Test P2447R4 "Annex C examples" + +__host__ __device__ constexpr int three(cuda::std::span sp) +{ + return static_cast(sp.size()); +} + +__host__ __device__ bool test_P2447R4_annex_c_examples() +{ + // 1. Overload resolution is affected + // --> tested in "initializer_list.verify.cpp" + + // 2. The `initializer_list` ctor has high precedence + // --> tested in "initializer_list.verify.cpp" + + // 3. 
Implicit two-argument construction with a highly convertible value_type + { + void* a[10]; + assert(three({a, 0}) == 2); + } + // { + // cuda::std::any a[10]; + // assert(four({a, a + 10}) == 2); + // } + return true; } int main(int, char**) { - test(); + assert(test()); static_assert(test(), ""); + assert(test_P2447R4_annex_c_examples()); + return 0; } From 83d180f487ac85c3985d39835d665ae676ba49b0 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:06:24 +0100 Subject: [PATCH 12/45] Add tuple protocol to `cuda::std::complex` from C++26 (#2882) --- .../include/cuda/std/__complex/nvbf16.h | 32 ++++ .../include/cuda/std/__complex/nvfp16.h | 32 ++++ libcudacxx/include/cuda/std/__fwd/complex.h | 30 +++ libcudacxx/include/cuda/std/__fwd/get.h | 13 ++ .../std/__tuple_dir/structured_bindings.h | 9 + .../include/cuda/std/__tuple_dir/tuple_like.h | 5 + .../cuda/std/__tuple_dir/tuple_like_ext.h | 5 + .../cuda/std/detail/libcxx/include/complex | 74 ++++++++ libcudacxx/include/cuda/std/version | 1 + .../complex.number/complex.tuple/get.pass.cpp | 171 ++++++++++++++++++ .../tuple_element_compiles.pass.cpp | 66 +++++++ .../tuple_size_compiles.pass.cpp | 62 +++++++ 12 files changed, 500 insertions(+) create mode 100644 libcudacxx/include/cuda/std/__fwd/complex.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index b456a53139a..ede7f05a29a 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -30,6 +30,7 @@ _CCCL_DIAG_POP # include # include +# include # include # include # include @@ -112,6 +113,9 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__nv_bfloat162)) compl template friend class complex; + template + friend struct __get_complex_impl; + public: using value_type = __nv_bfloat16; @@ -295,6 +299,34 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<__nv_bfloat16> acos(const complex<__nv_bfloat1 return complex<__nv_bfloat16>{_CUDA_VSTD::acos(complex{__x})}; } +template <> +struct __get_complex_impl<__nv_bfloat16> +{ + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __nv_bfloat16& get(complex<__nv_bfloat16>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __nv_bfloat16&& get(complex<__nv_bfloat16>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__repr_.x : __z.__repr_.y); + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __nv_bfloat16& get(const complex<__nv_bfloat16>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __nv_bfloat16&& get(const complex<__nv_bfloat16>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? 
__z.__repr_.x : __z.__repr_.y); + } +}; + # if !_CCCL_COMPILER(NVRTC) template ::std::basic_istream<_CharT, _Traits>& diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index de8b2538f94..11406f98588 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -27,6 +27,7 @@ # include # include +# include # include # include # include @@ -109,6 +110,9 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _CCCL_ALIGNAS(alignof(__half2)) complex<__ha template friend class complex; + template + friend struct __get_complex_impl; + public: using value_type = __half; @@ -292,6 +296,34 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<__half> acos(const complex<__half>& __x) return complex<__half>{_CUDA_VSTD::acos(complex{__x})}; } +template <> +struct __get_complex_impl<__half> +{ + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __half& get(complex<__half>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr __half&& get(complex<__half>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__repr_.x : __z.__repr_.y); + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __half& get(const complex<__half>& __z) noexcept + { + return (_Index == 0) ? __z.__repr_.x : __z.__repr_.y; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const __half&& get(const complex<__half>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__repr_.x : __z.__repr_.y); + } +}; + # if !defined(_LIBCUDACXX_HAS_NO_LOCALIZATION) && !_CCCL_COMPILER(NVRTC) template ::std::basic_istream<_CharT, _Traits>& operator>>(::std::basic_istream<_CharT, _Traits>& __is, complex<__half>& __x) diff --git a/libcudacxx/include/cuda/std/__fwd/complex.h b/libcudacxx/include/cuda/std/__fwd/complex.h new file mode 100644 index 00000000000..ba5617380dd --- /dev/null +++ b/libcudacxx/include/cuda/std/__fwd/complex.h @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023-24 NVIDIA CORPORATION & AFFILIATES. 
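A minimal sketch of what the tuple protocol added by this patch enables (not part of the patch itself; example is an illustrative name, and the structured binding assumes C++17 or later):

#include <cuda/std/cassert>
#include <cuda/std/complex>

__host__ __device__ void example()
{
  cuda::std::complex<float> z{3.0f, 4.0f};

  // get<0> yields the real part, get<1> the imaginary part.
  assert(cuda::std::get<0>(z) == 3.0f);
  assert(cuda::std::get<1>(z) == 4.0f);

  // The tuple_size / tuple_element specializations also make complex usable
  // with structured bindings; the references bind to the stored parts.
  auto& [re, im] = z;
  im = 5.0f;
  assert(re == 3.0f && z.imag() == 5.0f);
}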
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___FWD_COMPLEX_H +#define _LIBCUDACXX___FWD_COMPLEX_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +class _CCCL_TYPE_VISIBILITY_DEFAULT complex; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___FWD_COMPLEX_H diff --git a/libcudacxx/include/cuda/std/__fwd/get.h b/libcudacxx/include/cuda/std/__fwd/get.h index 6fd977fd158..70607edc813 100644 --- a/libcudacxx/include/cuda/std/__fwd/get.h +++ b/libcudacxx/include/cuda/std/__fwd/get.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,18 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp&& get(array<_Tp, _Size>&&) n template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 const _Tp&& get(const array<_Tp, _Size>&&) noexcept; +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& get(complex<_Tp>&) noexcept; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp&& get(complex<_Tp>&&) noexcept; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp& get(const complex<_Tp>&) noexcept; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp&& get(const complex<_Tp>&&) noexcept; + _LIBCUDACXX_END_NAMESPACE_STD #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) diff --git a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h index e054f78729e..2652536435d 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/structured_bindings.h @@ -31,6 +31,7 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wmismatched-tags") #endif // !_CCCL_COMPILER(NVRTC) #include +#include #include #include #include @@ -87,6 +88,14 @@ struct tuple_element<_Ip, const volatile _CUDA_VSTD::array<_Tp, _Size>> : _CUDA_VSTD::tuple_element<_Ip, const volatile _CUDA_VSTD::array<_Tp, _Size>> {}; +template +struct tuple_size<_CUDA_VSTD::complex<_Tp>> : _CUDA_VSTD::tuple_size<_CUDA_VSTD::complex<_Tp>> +{}; + +template +struct tuple_element<_Ip, _CUDA_VSTD::complex<_Tp>> : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::complex<_Tp>> +{}; + template struct tuple_size<_CUDA_VSTD::pair<_Tp, _Up>> : _CUDA_VSTD::tuple_size<_CUDA_VSTD::pair<_Tp, _Up>> {}; diff --git a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h index 28a6b1dada9..d9f30347dde 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like.h @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -56,6 +57,10 @@ template struct __tuple_like> : true_type {}; +template +struct __tuple_like> : true_type +{}; + #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) template struct __tuple_like<_CUDA_VRANGES::subrange<_Ip, _Sp, _Kp>> : true_type diff --git a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h index 064a3b2787b..8dc56ff460b 100644 --- a/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h +++ b/libcudacxx/include/cuda/std/__tuple_dir/tuple_like_ext.h @@ -21,6 +21,7 @@ #endif // no system header #include +#include #include #include #include @@ -55,6 
+56,10 @@ template struct __tuple_like_ext> : true_type {}; +template +struct __tuple_like_ext> : true_type +{}; + template struct __tuple_like_ext<__tuple_types<_Tp...>> : true_type {}; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index 68d59129e4e..4e98f7c9774 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -243,6 +243,9 @@ template complex tanh (const complex&); #endif // no system header #include +#include +#include +#include #include #include #include @@ -286,6 +289,9 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_COMPLEX_ALIGNAS complex template friend class complex; + template + friend struct __get_complex_impl; + public: using value_type = _Tp; @@ -1418,6 +1424,74 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> tan(const complex<_Tp>& __x) return complex<_Tp>(__z.imag(), -__z.real()); } +template +struct tuple_size> : _CUDA_VSTD::integral_constant +{}; + +template + struct tuple_element<_Index, complex<_Tp>> : _CUDA_VSTD::enable_if < _Index<2, _Tp> +{}; + +template +struct __get_complex_impl +{ + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& get(complex<_Tp>& __z) noexcept + { + return (_Index == 0) ? __z.__re_ : __z.__im_; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp&& get(complex<_Tp>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? __z.__re_ : __z.__im_); + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp& get(const complex<_Tp>& __z) noexcept + { + return (_Index == 0) ? __z.__re_ : __z.__im_; + } + + template + static _LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp&& get(const complex<_Tp>&& __z) noexcept + { + return _CUDA_VSTD::move((_Index == 0) ? 
__z.__re_ : __z.__im_); + } +}; + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp& get(complex<_Tp>& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(__z); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp&& get(complex<_Tp>&& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(_CUDA_VSTD::move(__z)); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp& get(const complex<_Tp>& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(__z); +} + +template +_LIBCUDACXX_HIDE_FROM_ABI constexpr const _Tp&& get(const complex<_Tp>&& __z) noexcept +{ + static_assert(_Index < 2, "Index value is out of range"); + + return __get_complex_impl<_Tp>::template get<_Index>(_CUDA_VSTD::move(__z)); +} + #if !_CCCL_COMPILER(NVRTC) template ::std::basic_istream<_CharT, _Traits>& operator>>(::std::basic_istream<_CharT, _Traits>& __is, complex<_Tp>& __x) diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index bb9475ede07..0762976d0d9 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -30,6 +30,7 @@ #endif // !_CCCL_COMPILER(NVRTC) #define __cccl_lib_to_underlying 202102L +// #define __cpp_lib_tuple_like 202311L // P2819R2 is implemented, but P2165R4 is not yet #if _CCCL_STD_VER >= 2014 # define __cccl_lib_bit_cast 201806L diff --git a/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp new file mode 100644 index 00000000000..bbf70e671e3 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/get.pass.cpp @@ -0,0 +1,171 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// + +// template +// constexpr T& get(complex&) noexcept; +// template +// constexpr T&& get(complex&&) noexcept; +// template +// constexpr const T& get(const complex&) noexcept; +// template +// constexpr const T&& get(const complex&&) noexcept; + +#include +#include +// #include +#include + +#include "test_macros.h" + +template +TEST_CONSTEXPR_CXX14 __host__ __device__ void test() +{ + // & + { + cuda::std::complex c{T{27}, T{28}}; + + auto& r = cuda::std::get<0>(c); + ASSERT_SAME_TYPE(T&, decltype(cuda::std::get<0>(c))); + static_assert(noexcept(cuda::std::get<0>(c)), ""); + assert(r == T{27}); + auto& i = cuda::std::get<1>(c); + ASSERT_SAME_TYPE(T&, decltype(cuda::std::get<1>(c))); + static_assert(noexcept(cuda::std::get<1>(c)), ""); + assert(i == T{28}); + } + // && + { + cuda::std::complex c{T{27}, T{28}}; + + auto&& r = cuda::std::get<0>(cuda::std::move(c)); + ASSERT_SAME_TYPE(T&&, decltype(cuda::std::get<0>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<0>(cuda::std::move(c))), ""); + assert(r == T{27}); + } + { + cuda::std::complex c{T{27}, T{28}}; + + auto&& i = cuda::std::get<1>(cuda::std::move(c)); + ASSERT_SAME_TYPE(T&&, decltype(cuda::std::get<1>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<1>(cuda::std::move(c))), ""); + assert(i == T{28}); + } + // const & + { + const cuda::std::complex c{T{27}, T{28}}; + + const auto& r = cuda::std::get<0>(c); + ASSERT_SAME_TYPE(const T&, decltype(cuda::std::get<0>(c))); + static_assert(noexcept(cuda::std::get<0>(c)), ""); + assert(r == T{27}); + const auto& i = cuda::std::get<1>(c); + ASSERT_SAME_TYPE(const T&, decltype(cuda::std::get<1>(c))); + static_assert(noexcept(cuda::std::get<1>(c)), ""); + assert(i == T{28}); + } + // const && + { + const cuda::std::complex c{T{27}, T{28}}; + + const auto&& r = cuda::std::get<0>(cuda::std::move(c)); + ASSERT_SAME_TYPE(const T&&, decltype(cuda::std::get<0>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<0>(cuda::std::move(c))), ""); + assert(r == T{27}); + } + { + const cuda::std::complex c{T{27}, T{28}}; + + const auto&& i = cuda::std::get<1>(cuda::std::move(c)); + ASSERT_SAME_TYPE(const T&&, decltype(cuda::std::get<1>(cuda::std::move(c)))); + static_assert(noexcept(cuda::std::get<1>(cuda::std::move(c))), ""); + assert(i == T{28}); + } + +#if TEST_STD_VER >= 2017 + // `get()` allows using `complex` with structured bindings + { + cuda::std::complex c{T{27}, T{28}}; + + auto [r, i]{c}; + ASSERT_SAME_TYPE(T, decltype(r)); + assert(r == T{27}); + ASSERT_SAME_TYPE(T, decltype(i)); + assert(i == T{28}); + } + { + cuda::std::complex c{T{27}, T{28}}; + + auto& [r, i]{c}; + ASSERT_SAME_TYPE(T, decltype(r)); + assert(r == T{27}); + ASSERT_SAME_TYPE(T, decltype(i)); + assert(i == T{28}); + } +#endif // TEST_STD_VER >= 2017 + + // TODO: Re-enable this test when we have cuda::ranges::views + // #if TEST_STD_VER >= 2017 + // // `get()` allows using `complex` with ranges + // { + // cuda::std::complex arr[]{{T{27}, T{28}}, {T{82}, T{94}}}; + + // auto reals = arr | cuda::std::views::elements<0>; + // ASSERT_SAME_AS(T, cuda::std::ranges::range_value_t); + // assert(cuda::std::ranges::size(reals) == 2); + // assert(cuda::std::ranges::equal(reals, std::array{27, 82})); + + // auto imags = arr | cuda::std::views::elements<0>; + // ASSERT_SAME_AS(T, cuda::std::ranges::range_value_t); + // assert(cuda::std::ranges::size(imags) == 2); + // assert(cuda::std::ranges::equal(imags, 
std::array{28, 94})); + // } + // #endif // TEST_STD_VER >= 2017 +} + +__host__ __device__ bool test() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif + + return true; +} + +TEST_CONSTEXPR_CXX14 __host__ __device__ bool test_constexpr() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + + return true; +} + +int main(int, char**) +{ + test(); + +#if TEST_STD_VER >= 2014 + static_assert(test_constexpr(), ""); +#endif + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp new file mode 100644 index 00000000000..660af111335 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_element_compiles.pass.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +// template struct tuple_element; + +#include +#include +#include + +#include "test_macros.h" + +template +struct HasTupleElement : cuda::std::false_type +{}; + +template +struct HasTupleElement{})>> : cuda::std::true_type +{}; + +struct SomeObject +{}; + +static_assert(!HasTupleElement<0, SomeObject>::value, ""); +static_assert(!HasTupleElement<1, SomeObject>::value, ""); +static_assert(!HasTupleElement<3, SomeObject>::value, ""); + +template +__host__ __device__ void test() +{ + using C = cuda::std::complex; + + static_assert(HasTupleElement<0, C>::value, ""); + static_assert(HasTupleElement<1, C>::value, ""); + + ASSERT_SAME_TYPE(T, typename cuda::std::tuple_element<0, C>::type); + ASSERT_SAME_TYPE(T, typename cuda::std::tuple_element<1, C>::type); +} + +__host__ __device__ void test() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp new file mode 100644 index 00000000000..4e34a5c0d64 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/complex.number/complex.tuple/tuple_size_compiles.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// + +// template struct tuple_size; + +#include +#include +#include + +#include "test_macros.h" + +template +struct HasTupleSize : cuda::std::false_type +{}; + +template +struct HasTupleSize{})>> : cuda::std::true_type +{}; + +struct SomeObject +{}; + +static_assert(!HasTupleSize::value, ""); + +template +__host__ __device__ void test() +{ + using C = cuda::std::complex; + + static_assert(HasTupleSize::value, ""); + ASSERT_SAME_TYPE(size_t, typename cuda::std::tuple_size::value_type); + static_assert(cuda::std::tuple_size() == 2, ""); +} + +__host__ __device__ void test() +{ + test(); + test(); + + // CUDA treats long double as double + // test(); + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif +} + +int main(int, char**) +{ + return 0; +} From 537b05013f3e8930cc393f2384cab93c499dd85e Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 23 Nov 2024 09:04:49 +0100 Subject: [PATCH 13/45] Add missing qualifier for cuda namespace (#2940) Fixes: #2939 --- cub/cub/device/dispatch/dispatch_histogram.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 15e0311fa2a..9df804d41fd 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -419,7 +419,7 @@ struct dispatch_histogram privatized_decode_op, privatized_decode_op + NUM_ACTIVE_CHANNELS, privatized_decode_op_wrapper.begin()); ::cuda::std::copy(output_decode_op, output_decode_op + NUM_ACTIVE_CHANNELS, output_decode_op_wrapper.begin()); - auto minus_one = cuda::proclaim_return_type([](int levels) { + auto minus_one = ::cuda::proclaim_return_type([](int levels) { return levels - 1; }); ::cuda::std::transform( From c22af18463566a2de4040941deb5895739910f5a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 23 Nov 2024 09:06:44 +0100 Subject: [PATCH 14/45] Try to fix a clang warning: (#2941) agent_histogram.cuh:827:37: warning: comparison of different enumeration types --- cub/cub/agent/agent_histogram.cuh | 46 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index f324de52bce..21a487828ca 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -106,23 +106,19 @@ template struct AgentHistogramPolicy { - enum - { - /// Threads per thread block - BLOCK_THREADS = _BLOCK_THREADS, - - /// Pixels per thread (per tile of input) - PIXELS_PER_THREAD = _PIXELS_PER_THREAD, + /// Threads per thread block + static constexpr int BLOCK_THREADS = _BLOCK_THREADS; + /// Pixels per thread (per tile of input) + static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD; - /// Whether to perform localized RLE to compress samples before histogramming - IS_RLE_COMPRESS = _RLE_COMPRESS, + /// Whether to perform localized RLE to compress samples before histogramming + static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS; - /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - MEM_PREFERENCE = _MEM_PREFERENCE, + /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE; - /// Whether to dequeue tiles from a 
global work queue - IS_WORK_STEALING = _WORK_STEALING, - }; + /// Whether to dequeue tiles from a global work queue + static constexpr bool IS_WORK_STEALING = _WORK_STEALING; /// Vector size for samples loading (1, 2, 4) static constexpr int VEC_SIZE = _VEC_SIZE; @@ -202,23 +198,21 @@ struct AgentHistogram using VecT = typename CubVector::Type; /// Constants - enum - { - BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS; - PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, - SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, - VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize, + static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD; + static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS; + static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize; - TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, - TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS; + static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS; - IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS; - MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM, + static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = + (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM; - IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, - }; + static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; From 80031e29baa11e1674b7d30770badeca0fbdb5dc Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Sun, 24 Nov 2024 10:03:30 -0800 Subject: [PATCH 15/45] minor consistency improvements in concepts macros (#2928) --- .../cuda/std/__concepts/concept_macros.h | 107 +++++++++--------- 1 file changed, 52 insertions(+), 55 deletions(-) diff --git a/libcudacxx/include/cuda/std/__concepts/concept_macros.h b/libcudacxx/include/cuda/std/__concepts/concept_macros.h index 2850c38a493..18587ca57df 100644 --- a/libcudacxx/include/cuda/std/__concepts/concept_macros.h +++ b/libcudacxx/include/cuda/std/__concepts/concept_macros.h @@ -24,6 +24,8 @@ # pragma system_header #endif // no system header +#include // for size_t + //////////////////////////////////////////////////////////////////////////////// // _CCCL_TEMPLATE // Usage: @@ -50,7 +52,7 @@ using __cccl_enable_if_t = typename __cccl_select<_Bp>::template type<_Tp>; template using __cccl_requires_t = typename __cccl_select<_Bp>::template type<_Tp>; -#if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) +#if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_TEMPLATE(...) template <__VA_ARGS__> # define _CCCL_REQUIRES(...) requires __VA_ARGS__ # define _CCCL_AND && @@ -58,43 +60,38 @@ using __cccl_requires_t = typename __cccl_select<_Bp>::template type<_Tp>; # define _CCCL_TRAILING_REQUIRES(...) ->__VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ #else // ^^^ __cpp_concepts ^^^ / vvv !__cpp_concepts vvv # define _CCCL_TEMPLATE(...) template <__VA_ARGS__ -# define _CCCL_REQUIRES(...) 
, bool _CCCL_true_ = true, __cccl_enable_if_t < __VA_ARGS__ && _CCCL_true_, int > = 0 > -# define _CCCL_AND &&_CCCL_true_, int > = 0, __cccl_enable_if_t < +# define _CCCL_REQUIRES(...) , bool __cccl_true_ = true, __cccl_enable_if_t < __VA_ARGS__ && __cccl_true_, int > = 0 > +# define _CCCL_AND &&__cccl_true_, int > = 0, __cccl_enable_if_t < # define _CCCL_TRAILING_REQUIRES_AUX_(...) , __VA_ARGS__ > # define _CCCL_TRAILING_REQUIRES(...) ->__cccl_requires_t < __VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ #endif // !__cpp_concepts #if _CCCL_STD_VER >= 2014 -namespace __cccl_concept -{ - -template -struct _Tag; +template +struct __cccl_tag; template -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __is_true() +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __cccl_is_true() { return true; } -# if _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) +# if _CCCL_COMPILER(MSVC) template _LIBCUDACXX_HIDE_FROM_ABI __cccl_enable_if_t<_Bp> __cccl_requires() {} -# else // ^^^ _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(MSVC) ^^^ / vvv other compilers vvv +# else // ^^^ _CCCL_COMPILER(MSVC) ^^^ / vvv !_CCCL_COMPILER(MSVC) vvv template = 0> _CCCL_INLINE_VAR constexpr int __cccl_requires = 0; -# endif // !_CCCL_COMPILER(CLANG) && !_CCCL_COMPILER(MSVC) +# endif // !_CCCL_COMPILER(MSVC) template -_LIBCUDACXX_HIDE_FROM_ABI auto __cccl_make_dependent(_Tp*, _Tag<_Args...>*) -> _Tp; +_LIBCUDACXX_HIDE_FROM_ABI auto __cccl_make_dependent(_Tp*, __cccl_tag<_Args...>*) -> _Tp; template -using __requires_expr_impl = - decltype(__cccl_make_dependent(static_cast<_Impl*>(nullptr), static_cast<_Tag*>(nullptr))); - -} // namespace __cccl_concept +using __cccl_requires_expr_impl = + decltype(__cccl_make_dependent(static_cast<_Impl*>(nullptr), static_cast<__cccl_tag*>(nullptr))); // So that we can refer to the ::cuda::std namespace below _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -107,10 +104,10 @@ _LIBCUDACXX_END_NAMESPACE_STD // // where ::concept is a fully qualified name, would not compile. The // _CUDA_VSTD macro is fully qualified. -namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls) +namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls) # if _CCCL_CUDACC_BELOW(12, 2) -# define _CCCL_CONCEPT_VSTD __unqualified_cuda_std // must not be fully qualified +# define _CCCL_CONCEPT_VSTD __cccl_unqualified_cuda_std // must not be fully qualified # else # define _CCCL_CONCEPT_VSTD _CUDA_VSTD # endif @@ -118,10 +115,10 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT_REQS_M0(_REQ) _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_(_REQ)(_REQ) # define _CCCL_CONCEPT_FRAGMENT_REQS_M1(_REQ) _CCCL_PP_EXPAND _REQ # define _CCCL_CONCEPT_FRAGMENT_REQS_(...) 
{_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__)} -# define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_(_REQ) \ - _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_, \ - _CCCL_PP_EVAL(_CCCL_PP_CHECK, _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_, _REQ))) \ - /**/ +# define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_(_REQ) \ + _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_, \ + _CCCL_PP_EVAL(_CCCL_PP_CHECK, _CCCL_PP_CAT3(_CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_, _REQ))) + # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_requires _CCCL_PP_PROBE_N(~, 1) # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_noexcept _CCCL_PP_PROBE_N(~, 2) # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_PROBE_typename _CCCL_PP_PROBE_N(~, 3) @@ -132,15 +129,14 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_2 _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_OR_NOEXCEPT # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_3 _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_OR_NOEXCEPT # define _CCCL_CONCEPT_FRAGMENT_REQS_SELECT_4 _CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS + # define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_OR_NOEXCEPT(_REQ) \ _CCCL_PP_CAT4(_CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_, _REQ) # define _CCCL_PP_EAT_TYPENAME_PROBE_typename _CCCL_PP_PROBE(~) # define _CCCL_PP_EAT_TYPENAME_SELECT_(_Xp, ...) \ _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_SELECT_, \ _CCCL_PP_EVAL(_CCCL_PP_CHECK, _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_PROBE_, _Xp))) -# define _CCCL_PP_EAT_TYPENAME_(...) \ - _CCCL_PP_EVAL2(_CCCL_PP_EAT_TYPENAME_SELECT_, __VA_ARGS__, ) \ - (__VA_ARGS__) +# define _CCCL_PP_EAT_TYPENAME_(...) _CCCL_PP_EVAL2(_CCCL_PP_EAT_TYPENAME_SELECT_, __VA_ARGS__, )(__VA_ARGS__) # define _CCCL_PP_EAT_TYPENAME_SELECT_0(...) __VA_ARGS__ # define _CCCL_PP_EAT_TYPENAME_SELECT_1(...) _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_, __VA_ARGS__) # define _CCCL_PP_EAT_TYPENAME_typename @@ -178,13 +174,13 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT(_NAME, ...) \ _LIBCUDACXX_HIDE_FROM_ABI auto _NAME##_CCCL_CONCEPT_FRAGMENT_impl_ _CCCL_CONCEPT_FRAGMENT_REQS_##__VA_ARGS__ > { \ } \ - template \ + template \ _LIBCUDACXX_HIDE_FROM_ABI char _NAME##_CCCL_CONCEPT_FRAGMENT_( \ - __cccl_concept::_Tag<_As...>*, decltype(&_NAME##_CCCL_CONCEPT_FRAGMENT_impl_<_As...>)); \ - _LIBCUDACXX_HIDE_FROM_ABI char(&_NAME##_CCCL_CONCEPT_FRAGMENT_(...))[2] /**/ + ::__cccl_tag<_As...>*, decltype(&_NAME##_CCCL_CONCEPT_FRAGMENT_impl_<_As...>)); \ + _LIBCUDACXX_HIDE_FROM_ABI char(&_NAME##_CCCL_CONCEPT_FRAGMENT_(...))[2] # if defined(_MSC_VER) && !defined(__clang__) # define _CCCL_CONCEPT_FRAGMENT_TRUE(...) \ - __cccl_concept::__is_true() + ::__cccl_is_true() # else # define _CCCL_CONCEPT_FRAGMENT_TRUE(...) \ !(decltype(_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__) void(), false){}) @@ -194,23 +190,22 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_CONCEPT_FRAGMENT_REQS_M(_REQ) \ _CCCL_PP_CAT2(_CCCL_CONCEPT_FRAGMENT_REQS_M, _CCCL_PP_IS_PAREN(_REQ)) \ (_REQ), -# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_requires(...) __cccl_concept::__cccl_requires<__VA_ARGS__> -# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_typename(...) static_cast<__cccl_concept::_Tag<__VA_ARGS__>*>(nullptr) +# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_requires(...) ::__cccl_requires<__VA_ARGS__> +# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_typename(...) 
static_cast<::__cccl_tag<__VA_ARGS__>*>(nullptr) # if _CCCL_COMPILER(GCC, <, 14) // GCC < 14 can't mangle noexcept expressions, so just check that the // expression is well-formed. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70790 # define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_noexcept(...) __VA_ARGS__ # else -# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_noexcept(...) __cccl_concept::__cccl_requires +# define _CCCL_CONCEPT_FRAGMENT_REQS_REQUIRES_noexcept(...) ::__cccl_requires # endif # define _CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS(_REQ) \ - __cccl_concept::__cccl_requires< \ - _CUDA_VSTD::same_as<_CCCL_PP_CAT4(_CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS_, _REQ) _CCCL_PP_RPAREN>> + ::__cccl_requires<_CUDA_VSTD::same_as<_CCCL_PP_CAT4(_CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS_, _REQ) _CCCL_PP_RPAREN>> # define _CCCL_CONCEPT_FRAGMENT_REQS_SAME_AS__Same_as(...) __VA_ARGS__, decltype _CCCL_PP_LPAREN # define _CCCL_FRAGMENT(_NAME, ...) \ - (1u == sizeof(_NAME##_CCCL_CONCEPT_FRAGMENT_(static_cast<__cccl_concept::_Tag<__VA_ARGS__>*>(nullptr), nullptr))) + (1u == sizeof(_NAME##_CCCL_CONCEPT_FRAGMENT_(static_cast<::__cccl_tag<__VA_ARGS__>*>(nullptr), nullptr))) # endif @@ -225,7 +220,7 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls // ); // // Can only be used as the last requirement in a concept definition. -# if defined(__cpp_concepts) && _CCCL_STD_VER >= 2020 +# if defined(__cpp_concepts) && _CCCL_STD_VER >= 2020 || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_REQUIRES_EXPR(_TY, ...) requires(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 # define _CCCL_REQUIRES_EXPR_2(...) {_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__)} # else @@ -249,27 +244,29 @@ namespace __unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias-decls # define _CCCL_REQUIRES_EXPR_EXPAND_TPARAMS(...) _CCCL_PP_FOR_EACH(_CCCL_REQUIRES_EXPR_EXPAND_TPARAM, __VA_ARGS__) # define _CCCL_REQUIRES_EXPR(_TY, ...) \ - __cccl_concept::__requires_expr_impl:: \ - __cccl_is_satisfied(static_cast<__cccl_concept::_Tag*>(nullptr), \ - static_cast(nullptr)); \ + ::__cccl_requires_expr_impl< \ + struct _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__) _CCCL_REQUIRES_EXPR_EXPAND_TPARAMS \ + _TY>::__cccl_is_satisfied(static_cast<::__cccl_tag*>(nullptr), \ + static_cast(nullptr)); \ struct _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__) \ { \ - using _Self_t = _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__); \ + using __cccl_self_t = _CCCL_PP_CAT(__cccl_requires_expr_detail_, __LINE__); \ template \ - _LIBCUDACXX_HIDE_FROM_ABI static auto _Well_formed(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 - -# define _CCCL_REQUIRES_EXPR_2(...) \ - ->decltype(_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__) void()) {} \ - template (&_Self_t::_Well_formed))> \ - _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(__cccl_concept::_Tag*, Sig*) \ - { \ - return true; \ - } \ - _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(void*, ...) \ - { \ - return false; \ - } \ + _LIBCUDACXX_HIDE_FROM_ABI static auto __cccl_well_formed(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 + +# define _CCCL_REQUIRES_EXPR_2(...) \ + ->decltype(_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__) void()) {} \ + template (&__cccl_self_t::__cccl_well_formed<_Args...>))> \ + _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(::__cccl_tag<_Args...>*, _Sig*) \ + { \ + return true; \ + } \ + _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __cccl_is_satisfied(void*, ...) 
\ + { \ + return false; \ + } \ } # endif From 18a014125be329b7b7b848f571c576c71b02bdee Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 11:02:24 +0100 Subject: [PATCH 16/45] Drop some of the mdspan fold implementation (#2949) * Drop unused macros * Do not return a custom struct * Replace `__MDSPAN_FOLD_AND` with `__fold_and_v` when possible --- .../include/cuda/std/__mdspan/extents.h | 20 ++++++------- .../include/cuda/std/__mdspan/layout_left.h | 7 +++-- .../include/cuda/std/__mdspan/layout_right.h | 7 +++-- .../include/cuda/std/__mdspan/layout_stride.h | 8 ++--- libcudacxx/include/cuda/std/__mdspan/macros.h | 30 ++++--------------- .../cuda/std/__mdspan/maybe_static_value.h | 9 ++---- libcudacxx/include/cuda/std/__mdspan/mdspan.h | 29 ++++++++---------- .../include/cuda/std/__mdspan/submdspan.h | 18 +++++------ 8 files changed, 51 insertions(+), 77 deletions(-) diff --git a/libcudacxx/include/cuda/std/__mdspan/extents.h b/libcudacxx/include/cuda/std/__mdspan/extents.h index 302cc26894a..d0bdfd016f6 100644 --- a/libcudacxx/include/cuda/std/__mdspan/extents.h +++ b/libcudacxx/include/cuda/std/__mdspan/extents.h @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -111,8 +112,7 @@ struct __compare_extent_compatible template static integral_constant::value) /* && ... */ - )> _CCCL_HOST_DEVICE + __fold_and_v<(__compare_extent_compatible<_Extents, _OtherExtents>::value)...>> _CCCL_HOST_DEVICE __check_compatible_extents(true_type, _CUDA_VSTD::integer_sequence, _CUDA_VSTD::integer_sequence) noexcept @@ -285,18 +285,16 @@ class extents _CCCL_REQUIRES( // TODO: check whether the other version works with newest NVCC, doesn't with 11.4 // NVCC seems to pick up rank_dynamic from the wrong extents type??? - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) - _CCCL_AND + __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral)...> _CCCL_AND // NVCC chokes on the fold thingy here so wrote the workaround ((sizeof...(_Integral) == __detail::__count_dynamic_extents<_Extents...>::val) || (sizeof...(_Integral) == sizeof...(_Extents)))) # else _CCCL_TEMPLATE(class... _Integral) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral) /* && ... */) - _CCCL_AND((sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank()))) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Integral, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Integral)...> _CCCL_AND( + (sizeof...(_Integral) == rank_dynamic()) || (sizeof...(_Integral) == rank()))) # endif _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr extents(_Integral... 
__exts) noexcept # ifndef _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS @@ -318,8 +316,8 @@ class extents # endif { /* TODO: precondition check - * If sizeof...(_IndexTypes) != rank_dynamic() is true, exts_arr[r] equals Er for each r for which Er is a static - * extent, and either + * If sizeof...(_IndexTypes) != rank_dynamic() is true, exts_arr[r] equals Er for each r for which Er is a + * static extent, and either * - sizeof...(__exts) == 0 is true, or * - each element of __exts is nonnegative and is a representable value of type index_type. */ diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_left.h b/libcudacxx/include/cuda/std/__mdspan/layout_left.h index 8a11107f390..1c105638de7 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_left.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_left.h @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -187,9 +188,9 @@ class layout_left::mapping //-------------------------------------------------------------------------------- _CCCL_TEMPLATE(class... _Indices) - _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _CCCL_AND __MDSPAN_FOLD_AND( - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) + _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type)...> // + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)...>) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { // Immediately cast incoming indices to `index_type` diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_right.h b/libcudacxx/include/cuda/std/__mdspan/layout_right.h index bd61461ab82..43a1df74b30 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_right.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_right.h @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -192,9 +193,9 @@ class layout_right::mapping //-------------------------------------------------------------------------------- _CCCL_TEMPLATE(class... _Indices) - _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) _CCCL_AND __MDSPAN_FOLD_AND( - (_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type) - && _CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)))) + _CCCL_REQUIRES((sizeof...(_Indices) == extents_type::rank()) + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _Indices, index_type)...> // + _CCCL_AND __fold_and_v<_CCCL_TRAIT(_CUDA_VSTD::is_nothrow_constructible, index_type, _Indices)...>) _CCCL_HOST_DEVICE constexpr index_type operator()(_Indices... __idxs) const noexcept { return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast(__idxs)...); diff --git a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h index 3f31820cf49..520ded0f222 100644 --- a/libcudacxx/include/cuda/std/__mdspan/layout_stride.h +++ b/libcudacxx/include/cuda/std/__mdspan/layout_stride.h @@ -61,6 +61,7 @@ # include #endif // _CCCL_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS #include +#include #include #include #include @@ -425,10 +426,9 @@ struct layout_stride } _CCCL_TEMPLATE(class... 
_Indices) - _CCCL_REQUIRES( - (sizeof...(_Indices) == _Extents::rank()) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _Indices, index_type) /*&& ...*/) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _Indices) /*&& ...*/)) + _CCCL_REQUIRES((sizeof...(_Indices) == _Extents::rank()) + _CCCL_AND __fold_and_v<_CCCL_TRAIT(is_convertible, _Indices, index_type)...> // + _CCCL_AND __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _Indices)...>) __MDSPAN_FORCE_INLINE_FUNCTION constexpr index_type operator()(_Indices... __idxs) const noexcept { diff --git a/libcudacxx/include/cuda/std/__mdspan/macros.h b/libcudacxx/include/cuda/std/__mdspan/macros.h index d3dc04b1111..b9b56adae37 100644 --- a/libcudacxx/include/cuda/std/__mdspan/macros.h +++ b/libcudacxx/include/cuda/std/__mdspan/macros.h @@ -276,18 +276,12 @@ //============================================================================== // {{{1 -struct __mdspan_enable_fold_comma -{}; - # ifdef __MDSPAN_USE_FOLD_EXPRESSIONS -# define __MDSPAN_FOLD_AND(...) ((__VA_ARGS__) && ...) -# define __MDSPAN_FOLD_AND_TEMPLATE(...) ((__VA_ARGS__) && ...) -# define __MDSPAN_FOLD_OR(...) ((__VA_ARGS__) || ...) -# define __MDSPAN_FOLD_ASSIGN_LEFT(__INIT, ...) (__INIT = ... = (__VA_ARGS__)) -# define __MDSPAN_FOLD_ASSIGN_RIGHT(__PACK, ...) (__PACK = ... = (__VA_ARGS__)) -# define __MDSPAN_FOLD_TIMES_RIGHT(__PACK, ...) (__PACK * ... * (__VA_ARGS__)) -# define __MDSPAN_FOLD_PLUS_RIGHT(__PACK, ...) (__PACK + ... + (__VA_ARGS__)) -# define __MDSPAN_FOLD_COMMA(...) ((__VA_ARGS__), ...) +# define __MDSPAN_FOLD_AND(...) ((__VA_ARGS__) && ...) +# define __MDSPAN_FOLD_OR(...) ((__VA_ARGS__) || ...) +# define __MDSPAN_FOLD_ASSIGN_LEFT(__INIT, ...) (__INIT = ... = (__VA_ARGS__)) +# define __MDSPAN_FOLD_TIMES_RIGHT(__PACK, ...) (__PACK * ... * (__VA_ARGS__)) +# define __MDSPAN_FOLD_PLUS_RIGHT(__PACK, ...) (__PACK + ... + (__VA_ARGS__)) # else _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -601,12 +595,6 @@ __fold_left_assign_impl(_Args&&... __args) # endif -template -_CCCL_HOST_DEVICE constexpr __mdspan_enable_fold_comma __fold_comma_impl(_Args&&...) noexcept -{ - return {}; -} - template struct __bools; @@ -618,18 +606,10 @@ _LIBCUDACXX_END_NAMESPACE_STD # define __MDSPAN_FOLD_OR(...) _CUDA_VSTD::__fold_compatibility_impl::__fold_right_or_impl((__VA_ARGS__)...) # define __MDSPAN_FOLD_ASSIGN_LEFT(__INIT, ...) \ _CUDA_VSTD::__fold_compatibility_impl::__fold_left_assign_impl(__INIT, (__VA_ARGS__)...) -# define __MDSPAN_FOLD_ASSIGN_RIGHT(__PACK, ...) \ - _CUDA_VSTD::__fold_compatibility_impl::__fold_right_assign_impl((__PACK)..., __VA_ARGS__) # define __MDSPAN_FOLD_TIMES_RIGHT(__PACK, ...) \ _CUDA_VSTD::__fold_compatibility_impl::__fold_right_times_impl((__PACK)..., __VA_ARGS__) # define __MDSPAN_FOLD_PLUS_RIGHT(__PACK, ...) \ _CUDA_VSTD::__fold_compatibility_impl::__fold_right_plus_impl((__PACK)..., __VA_ARGS__) -# define __MDSPAN_FOLD_COMMA(...) _CUDA_VSTD::__fold_compatibility_impl::__fold_comma_impl((__VA_ARGS__)...) - -# define __MDSPAN_FOLD_AND_TEMPLATE(...) 
\ - _CCCL_TRAIT(_CUDA_VSTD::is_same, \ - __fold_compatibility_impl::__bools<(__VA_ARGS__)..., true>, \ - __fold_compatibility_impl::__bools) # endif diff --git a/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h b/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h index 0a8d2696b9f..fd978c6c3f8 100644 --- a/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h +++ b/libcudacxx/include/cuda/std/__mdspan/maybe_static_value.h @@ -88,10 +88,9 @@ struct __maybe_static_value return static_cast<_dynamic_t>(__v); } template - __MDSPAN_FORCE_INLINE_FUNCTION constexpr __mdspan_enable_fold_comma __set_value(_Up&& /*__rhs*/) noexcept + __MDSPAN_FORCE_INLINE_FUNCTION constexpr void __set_value(_Up&& /*__rhs*/) noexcept { // Should we assert that the value matches the static value here? - return {}; } //-------------------------------------------------------------------------- @@ -132,10 +131,9 @@ struct __maybe_static_value<_dynamic_t, _static_t, __is_dynamic_sentinal, __is_d return __v; } template - __MDSPAN_FORCE_INLINE_FUNCTION constexpr __mdspan_enable_fold_comma __set_value(_Up&& __rhs) noexcept + __MDSPAN_FORCE_INLINE_FUNCTION constexpr void __set_value(_Up&& __rhs) noexcept { __v = (_Up&&) rhs; - return {}; } # else __MDSPAN_FORCE_INLINE_FUNCTION constexpr _dynamic_t __value() const noexcept @@ -147,10 +145,9 @@ struct __maybe_static_value<_dynamic_t, _static_t, __is_dynamic_sentinal, __is_d return this->__no_unique_address_emulation<_dynamic_t>::__ref(); } template - __MDSPAN_FORCE_INLINE_FUNCTION constexpr __mdspan_enable_fold_comma __set_value(_Up&& __rhs) noexcept + __MDSPAN_FORCE_INLINE_FUNCTION constexpr void __set_value(_Up&& __rhs) noexcept { this->__no_unique_address_emulation<_dynamic_t>::__ref() = (_Up&&) __rhs; - return {}; } # endif }; diff --git a/libcudacxx/include/cuda/std/__mdspan/mdspan.h b/libcudacxx/include/cuda/std/__mdspan/mdspan.h index ad359c555a2..b206a35fd55 100644 --- a/libcudacxx/include/cuda/std/__mdspan/mdspan.h +++ b/libcudacxx/include/cuda/std/__mdspan/mdspan.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -177,12 +178,11 @@ class mdspan _CCCL_HIDE_FROM_ABI constexpr mdspan(mdspan&&) = default; _CCCL_TEMPLATE(class... _SizeTypes) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - _CCCL_AND((sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) - _CCCL_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) - _CCCL_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_convertible, _SizeTypes, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes)...> _CCCL_AND( + (sizeof...(_SizeTypes) == rank()) || (sizeof...(_SizeTypes) == rank_dynamic())) + _CCCL_AND _CCCL_TRAIT(is_constructible, mapping_type, extents_type) + _CCCL_AND _CCCL_TRAIT(is_default_constructible, accessor_type)) _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr mdspan(data_handle_type __p, _SizeTypes... __dynamic_extents) // TODO @proposal-bug shouldn't I be allowed to do `move(__p)` here? : __members( @@ -264,10 +264,9 @@ class mdspan # if __MDSPAN_USE_BRACKET_OPERATOR _CCCL_TEMPLATE(class... _SizeTypes) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... 
*/) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - _CCCL_AND(rank() == sizeof...(_SizeTypes))) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_convertible, _SizeTypes, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes)...> _CCCL_AND( + rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator[](_SizeTypes... __indices) const { @@ -307,10 +306,9 @@ class mdspan # if __MDSPAN_USE_PAREN_OPERATOR _CCCL_TEMPLATE(class... _SizeTypes) - _CCCL_REQUIRES( - __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_convertible, _SizeTypes, index_type) /* && ... */) - _CCCL_AND __MDSPAN_FOLD_AND(_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes) /* && ... */) - _CCCL_AND(extents_type::rank() == sizeof...(_SizeTypes))) + _CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_convertible, _SizeTypes, index_type)...> _CCCL_AND + __fold_and_v<_CCCL_TRAIT(is_nothrow_constructible, index_type, _SizeTypes)...> _CCCL_AND( + extents_type::rank() == sizeof...(_SizeTypes))) __MDSPAN_FORCE_INLINE_FUNCTION constexpr reference operator()(_SizeTypes... __indices) const { @@ -440,8 +438,7 @@ class mdspan # if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) _CCCL_TEMPLATE(class _ElementType, class... _SizeTypes) -_CCCL_REQUIRES(__MDSPAN_FOLD_AND(_CCCL_TRAIT(is_integral, _SizeTypes) /* && ... */) - _CCCL_AND(sizeof...(_SizeTypes) > 0)) +_CCCL_REQUIRES(__fold_and_v<_CCCL_TRAIT(is_integral, _SizeTypes)...> _CCCL_AND(sizeof...(_SizeTypes) > 0)) _CCCL_HOST_DEVICE explicit mdspan(_ElementType*, _SizeTypes...) -> mdspan<_ElementType, dextents>; diff --git a/libcudacxx/include/cuda/std/__mdspan/submdspan.h b/libcudacxx/include/cuda/std/__mdspan/submdspan.h index 2053c3a6d88..aac6f43c85d 100644 --- a/libcudacxx/include/cuda/std/__mdspan/submdspan.h +++ b/libcudacxx/include/cuda/std/__mdspan/submdspan.h @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -250,8 +251,8 @@ struct __assign_op_slice_handler< _CUDA_VSTD::integer_sequence> { // TODO remove this for better compiler performance - static_assert(__MDSPAN_FOLD_AND((_Strides == dynamic_extent || _Strides > 0) /* && ... */), " "); - static_assert(__MDSPAN_FOLD_AND((_Offsets == dynamic_extent || _Offsets >= 0) /* && ... */), " "); + static_assert(__fold_and_v<(_Strides == dynamic_extent || _Strides > 0)...>, " "); + static_assert(__fold_and_v<(_Offsets == dynamic_extent || _Offsets >= 0)...>, " "); using __offsets_storage_t = __partially_static_sizes<_IndexT, size_t, _Offsets...>; using __extents_storage_t = __partially_static_sizes<_IndexT, size_t, _Exts...>; @@ -522,13 +523,12 @@ struct _is_layout_stride : true_type //============================================================================== _CCCL_TEMPLATE(class _ET, class _EXT, class _LP, class _AP, class... _SliceSpecs) -_CCCL_REQUIRES( - (_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) - || __detail::_is_layout_stride<_LP>::value) - _CCCL_AND __MDSPAN_FOLD_AND((_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) - || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t)) /* && ... 
*/) - _CCCL_AND(sizeof...(_SliceSpecs) == _EXT::rank())) +_CCCL_REQUIRES((_CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_left) + || _CCCL_TRAIT(_CUDA_VSTD::is_same, _LP, layout_right) || __detail::_is_layout_stride<_LP>::value) + _CCCL_AND __fold_and_v<(_CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, size_t) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, tuple) + || _CCCL_TRAIT(_CUDA_VSTD::is_convertible, _SliceSpecs, full_extent_t))...> + _CCCL_AND(sizeof...(_SliceSpecs) == _EXT::rank())) _LIBCUDACXX_HIDE_FROM_ABI __MDSPAN_DEDUCE_RETURN_TYPE_SINGLE_LINE( (constexpr submdspan(mdspan<_ET, _EXT, _LP, _AP> const& __src, _SliceSpecs... __slices) noexcept), ( From 0172045fe3bd610c5f7d3179408a0830f42d12e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Augonnet?= <158148890+caugonnet@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:49:04 +0100 Subject: [PATCH 17/45] [STF] Implement CUDASTF_DOT_TIMING for the ctx.cuda_kernel construct (#2950) * Implement CUDASTF_DOT_TIMING facility for ctx.cuda_kernel * clang-format --- .../__stf/internal/backend_ctx.cuh | 50 ++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 2822370c1f3..7a63df4c8c3 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -64,6 +64,8 @@ class graph_ctx; class null_partition; +class stream_ctx; + namespace reserved { @@ -290,13 +292,57 @@ public: t.set_symbol(symbol); } + auto& dot = *ctx.get_dot(); + auto& statistics = reserved::task_statistics::instance(); + + cudaEvent_t start_event, end_event; + const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + t.start(); + + int device = -1; + SCOPE(exit) { - t.end(); + t.end_uncleared(); + + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); + cuda_safe_call(cudaEventSynchronize(end_event)); + + float milliseconds = 0; + cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event)); + + if (dot.is_tracing()) + { + dot.template add_vertex_timing(t, milliseconds, device); + } + + if (statistics.is_calibrating()) + { + statistics.log_task_time(t, milliseconds); + } + } + } + + t.clear(); }; - auto& dot = *ctx.get_dot(); + if constexpr (::std::is_same_v) + { + if (record_time) + { + cuda_safe_call(cudaGetDevice(&device)); // We will use this to force it during the next run + // Events must be created here to avoid issues with multi-gpu + cuda_safe_call(cudaEventCreate(&start_event)); + cuda_safe_call(cudaEventCreate(&end_event)); + cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); + } + } + if (dot.is_tracing()) { dot.template add_vertex(t); From 0b36a7dd077a7a38fa560d95cf04b1096d1b2466 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 13:49:27 +0100 Subject: [PATCH 18/45] Avoid potential null dereference in `annotated_ptr` (#2951) Fixes [BUG]: UB in annotated_ptr #2942 --- libcudacxx/include/cuda/annotated_ptr | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libcudacxx/include/cuda/annotated_ptr b/libcudacxx/include/cuda/annotated_ptr index 51601986b7d..7c74be390f1 100644 --- a/libcudacxx/include/cuda/annotated_ptr +++ b/libcudacxx/include/cuda/annotated_ptr @@ -391,6 +391,10 @@ public: _CCCL_HOST_DEVICE pointer get() const noexcept { + if (__repr == 
nullptr) + { + return nullptr; + } constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; return __is_shared ? __repr : &(*annotated_ptr(__repr)); } From a791939bab707c6dcfe4f45bdd27777da2a37852 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:19:00 +0100 Subject: [PATCH 19/45] make compiler version comparison utility generic (#2952) --- libcudacxx/include/cuda/std/__cccl/compiler.h | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/compiler.h b/libcudacxx/include/cuda/std/__cccl/compiler.h index fd7e93d22cb..037d5e753ed 100644 --- a/libcudacxx/include/cuda/std/__cccl/compiler.h +++ b/libcudacxx/include/cuda/std/__cccl/compiler.h @@ -11,7 +11,34 @@ #ifndef __CCCL_COMPILER_H #define __CCCL_COMPILER_H +// Utility to compare version numbers. To use: +// 1) Define a macro that makes a version number from major and minor numbers, e. g.: +// #define MYPRODUCT_MAKE_VERSION(_MAJOR, _MINOR) (_MAJOR * 100 + _MINOR) +// 2) Define a macro that you will use to compare versions, e. g.: +// #define MYPRODUCT(...) _CCCL_VERSION_COMPARE(MYPRODUCT, MYPRODUCT_##__VA_ARGS__) +// Signatures: +// MYPRODUCT(_PROD) - is the product _PROD version non-zero? +// MYPRODUCT(_PROD, _OP, _MAJOR) - compare the product _PROD version to _MAJOR using operator _OP +// MYPRODUCT(_PROD, _OP, _MAJOR, _MINOR) - compare the product _PROD version to _MAJOR._MINOR using operator _OP +#define _CCCL_VERSION_COMPARE_1(_PREFIX, _VER) (_VER != 0) +#define _CCCL_VERSION_COMPARE_3(_PREFIX, _VER, _OP, _MAJOR) _CCCL_VERSION_COMPARE_4(_PREFIX, _VER, _OP, _MAJOR, 0) +#define _CCCL_VERSION_COMPARE_4(_PREFIX, _VER, _OP, _MAJOR, _MINOR) \ + (_CCCL_VERSION_COMPARE_1(_PREFIX, _VER) && (_VER _OP _PREFIX##_MAKE_VERSION(_MAJOR, _MINOR))) +#define _CCCL_VERSION_SELECT_COUNT(_ARG1, _ARG2, _ARG3, _ARG4, _ARG5, ...) _ARG5 +#define _CCCL_VERSION_SELECT2(_ARGS) _CCCL_VERSION_SELECT_COUNT _ARGS +// MSVC traditonal preprocessor requires an extra level of indirection +#define _CCCL_VERSION_SELECT(...) \ + _CCCL_VERSION_SELECT2( \ + (__VA_ARGS__, \ + _CCCL_VERSION_COMPARE_4, \ + _CCCL_VERSION_COMPARE_3, \ + _CCCL_VERSION_COMPARE_BAD_ARG_COUNT, \ + _CCCL_VERSION_COMPARE_1, \ + _CCCL_VERSION_COMPARE_BAD_ARG_COUNT)) +#define _CCCL_VERSION_COMPARE(_PREFIX, ...) _CCCL_VERSION_SELECT(__VA_ARGS__)(_PREFIX, __VA_ARGS__) + #define _CCCL_COMPILER_MAKE_VERSION(_MAJOR, _MINOR) (_MAJOR * 100 + _MINOR) +#define _CCCL_COMPILER(...) _CCCL_VERSION_COMPARE(_CCCL_COMPILER, _CCCL_COMPILER_##__VA_ARGS__) // Determine the host compiler and its version #if defined(__INTEL_COMPILER) @@ -39,24 +66,6 @@ # define _CCCL_COMPILER_NVRTC _CCCL_COMPILER_MAKE_VERSION(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__) #endif -#define _CCCL_COMPILER_COMPARE_VERSION_1(_COMP) _COMP -#define _CCCL_COMPILER_COMPARE_VERSION_3(_COMP, _OP, _MAJOR) _CCCL_COMPILER_COMPARE_VERSION_4(_COMP, _OP, _MAJOR, 0) -#define _CCCL_COMPILER_COMPARE_VERSION_4(_COMP, _OP, _MAJOR, _MINOR) \ - (_COMP && (_COMP _OP _CCCL_COMPILER_MAKE_VERSION(_MAJOR, _MINOR))) - -#define _CCCL_COMPILER_SELECT_COUNT(_ARG1, _ARG2, _ARG3, _ARG4, _ARG5, ...) _ARG5 -#define _CCCL_COMPILER_SELECT2(_ARGS) _CCCL_COMPILER_SELECT_COUNT _ARGS -// MSVC traditonal preprocessor requires an extra level of indirection -#define _CCCL_COMPILER_SELECT(...) 
\ - _CCCL_COMPILER_SELECT2( \ - (__VA_ARGS__, \ - _CCCL_COMPILER_COMPARE_VERSION_4, \ - _CCCL_COMPILER_COMPARE_VERSION_3, \ - _CCCL_COMPILER_COMPARE_BAD_ARG_COUNT, \ - _CCCL_COMPILER_COMPARE_VERSION_1, \ - _CCCL_COMPILER_COMPARE_BAD_ARG_COUNT)) -#define _CCCL_COMPILER(...) _CCCL_COMPILER_SELECT(_CCCL_COMPILER_##__VA_ARGS__)(_CCCL_COMPILER_##__VA_ARGS__) - // Determine the cuda compiler #if defined(__NVCC__) # define _CCCL_CUDA_COMPILER_NVCC From 5ad00af618509234c02a88d9860cd1c8415ab66b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 19:02:21 +0100 Subject: [PATCH 20/45] Add SM100 descriptor to target (#2954) This is adding the missing sm_100 identifier to nv/target Fixes #2890 --- libcudacxx/include/nv/target | 171 ++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 83 deletions(-) diff --git a/libcudacxx/include/nv/target b/libcudacxx/include/nv/target index 1ad75b45b29..6a3a3f0f40c 100644 --- a/libcudacxx/include/nv/target +++ b/libcudacxx/include/nv/target @@ -50,25 +50,26 @@ typedef unsigned long long base_int_t; constexpr base_int_t all_hosts = 1; // NVIDIA GPUs -constexpr base_int_t sm_35_bit = 1 << 1; -constexpr base_int_t sm_37_bit = 1 << 2; -constexpr base_int_t sm_50_bit = 1 << 3; -constexpr base_int_t sm_52_bit = 1 << 4; -constexpr base_int_t sm_53_bit = 1 << 5; -constexpr base_int_t sm_60_bit = 1 << 6; -constexpr base_int_t sm_61_bit = 1 << 7; -constexpr base_int_t sm_62_bit = 1 << 8; -constexpr base_int_t sm_70_bit = 1 << 9; -constexpr base_int_t sm_72_bit = 1 << 10; -constexpr base_int_t sm_75_bit = 1 << 11; -constexpr base_int_t sm_80_bit = 1 << 12; -constexpr base_int_t sm_86_bit = 1 << 13; -constexpr base_int_t sm_87_bit = 1 << 14; -constexpr base_int_t sm_89_bit = 1 << 15; -constexpr base_int_t sm_90_bit = 1 << 16; +constexpr base_int_t sm_35_bit = 1 << 1; +constexpr base_int_t sm_37_bit = 1 << 2; +constexpr base_int_t sm_50_bit = 1 << 3; +constexpr base_int_t sm_52_bit = 1 << 4; +constexpr base_int_t sm_53_bit = 1 << 5; +constexpr base_int_t sm_60_bit = 1 << 6; +constexpr base_int_t sm_61_bit = 1 << 7; +constexpr base_int_t sm_62_bit = 1 << 8; +constexpr base_int_t sm_70_bit = 1 << 9; +constexpr base_int_t sm_72_bit = 1 << 10; +constexpr base_int_t sm_75_bit = 1 << 11; +constexpr base_int_t sm_80_bit = 1 << 12; +constexpr base_int_t sm_86_bit = 1 << 13; +constexpr base_int_t sm_87_bit = 1 << 14; +constexpr base_int_t sm_89_bit = 1 << 15; +constexpr base_int_t sm_90_bit = 1 << 16; +constexpr base_int_t sm_100_bit = 1 << 17; constexpr base_int_t all_devices = sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit - | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit; + | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit | sm_100_bit; // Store a set of targets as a set of bits struct _NV_BITSET_ATTRIBUTE target_description @@ -83,22 +84,23 @@ struct _NV_BITSET_ATTRIBUTE target_description // The type of the user-visible names of the NVIDIA GPU targets enum class sm_selector : base_int_t { - sm_35 = 35, - sm_37 = 37, - sm_50 = 50, - sm_52 = 52, - sm_53 = 53, - sm_60 = 60, - sm_61 = 61, - sm_62 = 62, - sm_70 = 70, - sm_72 = 72, - sm_75 = 75, - sm_80 = 80, - sm_86 = 86, - sm_87 = 87, - sm_89 = 89, - sm_90 = 90, + sm_35 = 35, + sm_37 = 37, + sm_50 = 50, + sm_52 = 52, + sm_53 = 53, + sm_60 = 60, + sm_61 = 61, + sm_62 = 62, + sm_70 = 70, + sm_72 = 72, + sm_75 = 75, + sm_80 = 80, + sm_86 = 86, + sm_87 = 87, + sm_89 = 89, + 
sm_90 = 90, + sm_100 = 100, }; constexpr base_int_t toint(sm_selector a) @@ -108,44 +110,46 @@ constexpr base_int_t toint(sm_selector a) constexpr base_int_t bitexact(sm_selector a) { - return toint(a) == 35 ? sm_35_bit - : toint(a) == 37 ? sm_37_bit - : toint(a) == 50 ? sm_50_bit - : toint(a) == 52 ? sm_52_bit - : toint(a) == 53 ? sm_53_bit - : toint(a) == 60 ? sm_60_bit - : toint(a) == 61 ? sm_61_bit - : toint(a) == 62 ? sm_62_bit - : toint(a) == 70 ? sm_70_bit - : toint(a) == 72 ? sm_72_bit - : toint(a) == 75 ? sm_75_bit - : toint(a) == 80 ? sm_80_bit - : toint(a) == 86 ? sm_86_bit - : toint(a) == 87 ? sm_87_bit - : toint(a) == 89 ? sm_89_bit - : toint(a) == 90 ? sm_90_bit - : 0; + return toint(a) == 35 ? sm_35_bit + : toint(a) == 37 ? sm_37_bit + : toint(a) == 50 ? sm_50_bit + : toint(a) == 52 ? sm_52_bit + : toint(a) == 53 ? sm_53_bit + : toint(a) == 60 ? sm_60_bit + : toint(a) == 61 ? sm_61_bit + : toint(a) == 62 ? sm_62_bit + : toint(a) == 70 ? sm_70_bit + : toint(a) == 72 ? sm_72_bit + : toint(a) == 75 ? sm_75_bit + : toint(a) == 80 ? sm_80_bit + : toint(a) == 86 ? sm_86_bit + : toint(a) == 87 ? sm_87_bit + : toint(a) == 89 ? sm_89_bit + : toint(a) == 90 ? sm_90_bit + : toint(a) == 100 ? sm_100_bit + : 0; } constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 90 ? sm_90_bit - : toint(a) >= 89 ? sm_89_bit - : toint(a) >= 87 ? sm_87_bit - : toint(a) >= 86 ? sm_86_bit - : toint(a) >= 80 ? sm_80_bit - : toint(a) >= 75 ? sm_75_bit - : toint(a) >= 72 ? sm_72_bit - : toint(a) >= 70 ? sm_70_bit - : toint(a) >= 62 ? sm_62_bit - : toint(a) >= 61 ? sm_61_bit - : toint(a) >= 60 ? sm_60_bit - : toint(a) >= 53 ? sm_53_bit - : toint(a) >= 52 ? sm_52_bit - : toint(a) >= 50 ? sm_50_bit - : toint(a) >= 37 ? sm_37_bit - : toint(a) >= 35 ? sm_35_bit - : 0; + return toint(a) >= 100 ? sm_100_bit + : toint(a) >= 90 ? sm_90_bit + : toint(a) >= 89 ? sm_89_bit + : toint(a) >= 87 ? sm_87_bit + : toint(a) >= 86 ? sm_86_bit + : toint(a) >= 80 ? sm_80_bit + : toint(a) >= 75 ? sm_75_bit + : toint(a) >= 72 ? sm_72_bit + : toint(a) >= 70 ? sm_70_bit + : toint(a) >= 62 ? sm_62_bit + : toint(a) >= 61 ? sm_61_bit + : toint(a) >= 60 ? sm_60_bit + : toint(a) >= 53 ? sm_53_bit + : toint(a) >= 52 ? sm_52_bit + : toint(a) >= 50 ? sm_50_bit + : toint(a) >= 37 ? sm_37_bit + : toint(a) >= 35 ? 
sm_35_bit + : 0; } // Public API for NVIDIA GPUs @@ -188,22 +192,23 @@ constexpr target_description any_target = target_description(detail::all_hosts | constexpr target_description no_target = target_description(0); // The public names for NVIDIA GPU architectures -constexpr sm_selector sm_35 = sm_selector::sm_35; -constexpr sm_selector sm_37 = sm_selector::sm_37; -constexpr sm_selector sm_50 = sm_selector::sm_50; -constexpr sm_selector sm_52 = sm_selector::sm_52; -constexpr sm_selector sm_53 = sm_selector::sm_53; -constexpr sm_selector sm_60 = sm_selector::sm_60; -constexpr sm_selector sm_61 = sm_selector::sm_61; -constexpr sm_selector sm_62 = sm_selector::sm_62; -constexpr sm_selector sm_70 = sm_selector::sm_70; -constexpr sm_selector sm_72 = sm_selector::sm_72; -constexpr sm_selector sm_75 = sm_selector::sm_75; -constexpr sm_selector sm_80 = sm_selector::sm_80; -constexpr sm_selector sm_86 = sm_selector::sm_86; -constexpr sm_selector sm_87 = sm_selector::sm_87; -constexpr sm_selector sm_89 = sm_selector::sm_89; -constexpr sm_selector sm_90 = sm_selector::sm_90; +constexpr sm_selector sm_35 = sm_selector::sm_35; +constexpr sm_selector sm_37 = sm_selector::sm_37; +constexpr sm_selector sm_50 = sm_selector::sm_50; +constexpr sm_selector sm_52 = sm_selector::sm_52; +constexpr sm_selector sm_53 = sm_selector::sm_53; +constexpr sm_selector sm_60 = sm_selector::sm_60; +constexpr sm_selector sm_61 = sm_selector::sm_61; +constexpr sm_selector sm_62 = sm_selector::sm_62; +constexpr sm_selector sm_70 = sm_selector::sm_70; +constexpr sm_selector sm_72 = sm_selector::sm_72; +constexpr sm_selector sm_75 = sm_selector::sm_75; +constexpr sm_selector sm_80 = sm_selector::sm_80; +constexpr sm_selector sm_86 = sm_selector::sm_86; +constexpr sm_selector sm_87 = sm_selector::sm_87; +constexpr sm_selector sm_89 = sm_selector::sm_89; +constexpr sm_selector sm_90 = sm_selector::sm_90; +constexpr sm_selector sm_100 = sm_selector::sm_100; using detail::is_exactly; using detail::provides; From ace320be094aa157548323dcd3c72b54ccf85579 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 25 Nov 2024 19:39:04 +0100 Subject: [PATCH 21/45] Regenerate `cuda::ptx` headers/docs and run format (#2937) Overwrites all generated PTX header and documentation files and runs `pre-commit run --all-files`. Also exclude generated PTX headers from header check. 
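A usage sketch for the nv/target change above (illustration only, not part of either patch): the excerpt shows only the new sm_100 selector and bit constants, so the NV_PROVIDES_SM_100 query macro used below is an assumed name following the existing NV_PROVIDES_SM_xx pattern rather than something confirmed by this excerpt.

    #include <nv/target>

    __device__ int sm_bucket()
    {
      // Exactly one branch is compiled, chosen by the architecture this
      // translation unit targets. NV_PROVIDES_SM_100 is an assumed macro
      // name, mirroring the NV_PROVIDES_SM_xx queries of existing targets.
      NV_IF_ELSE_TARGET(NV_PROVIDES_SM_100,
                        (return 100;),
                        (return 0;));
    }

The same dispatch pattern already used for sm_90 and older architectures should extend unchanged once the new descriptor is wired into the query macros.
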
--- .../generated/barrier_cluster.rst | 13 ++- .../instructions/generated/cp_async_bulk.rst | 9 +- .../generated/cp_async_bulk_commit_group.rst | 5 +- .../generated/cp_async_bulk_multicast.rst | 5 +- .../generated/cp_async_bulk_tensor.rst | 23 ++-- .../cp_async_bulk_tensor_multicast.rst | 13 ++- .../generated/cp_async_bulk_wait_group.rst | 3 + .../generated/cp_reduce_async_bulk.rst | 55 +++++----- .../generated/cp_reduce_async_bulk_bf16.rst | 9 +- .../generated/cp_reduce_async_bulk_f16.rst | 9 +- .../generated/cp_reduce_async_bulk_tensor.rst | 3 + .../ptx/instructions/generated/fence.rst | 3 + .../generated/fence_mbarrier_init.rst | 5 +- .../generated/fence_proxy_alias.rst | 5 +- .../generated/fence_proxy_async.rst | 5 +- .../fence_proxy_tensormap_generic.rst | 3 + .../ptx/instructions/generated/getctarank.rst | 5 +- .../generated/mbarrier_arrive.rst | 11 +- .../generated/mbarrier_arrive_expect_tx.rst | 5 +- .../generated/mbarrier_arrive_no_complete.rst | 5 +- .../generated/mbarrier_expect_tx.rst | 3 + .../instructions/generated/mbarrier_init.rst | 5 +- .../generated/mbarrier_test_wait.rst | 5 +- .../generated/mbarrier_test_wait_parity.rst | 5 +- .../generated/mbarrier_try_wait.rst | 7 +- .../generated/mbarrier_try_wait_parity.rst | 7 +- .../ptx/instructions/generated/red_async.rst | 23 ++-- .../ptx/instructions/generated/st_async.rst | 3 + .../generated/tensormap_cp_fenceproxy.rst | 3 + .../generated/tensormap_replace.rst | 3 + .../cuda/__ptx/instructions/barrier_cluster.h | 2 +- .../cuda/__ptx/instructions/cp_async_bulk.h | 4 +- .../instructions/cp_async_bulk_commit_group.h | 2 +- .../__ptx/instructions/cp_async_bulk_tensor.h | 4 +- .../instructions/cp_async_bulk_wait_group.h | 2 +- .../__ptx/instructions/cp_reduce_async_bulk.h | 6 +- .../cp_reduce_async_bulk_tensor.h | 2 +- .../include/cuda/__ptx/instructions/fence.h | 10 +- ...{barrier_cluster.inc => barrier_cluster.h} | 17 ++- .../{cp_async_bulk.inc => cp_async_bulk.h} | 13 ++- ...group.inc => cp_async_bulk_commit_group.h} | 9 +- ...ulticast.inc => cp_async_bulk_multicast.h} | 9 +- ...bulk_tensor.inc => cp_async_bulk_tensor.h} | 27 +++-- ...t.inc => cp_async_bulk_tensor_multicast.h} | 17 ++- ...t_group.inc => cp_async_bulk_wait_group.h} | 15 ++- ..._async_bulk.inc => cp_reduce_async_bulk.h} | 67 ++++++------ ...k_bf16.inc => cp_reduce_async_bulk_bf16.h} | 27 +++-- ...ulk_f16.inc => cp_reduce_async_bulk_f16.h} | 25 +++-- ...nsor.inc => cp_reduce_async_bulk_tensor.h} | 77 +++++++------ .../generated/{fence.inc => fence.h} | 19 +++- ...barrier_init.inc => fence_mbarrier_init.h} | 9 +- ...ce_proxy_alias.inc => fence_proxy_alias.h} | 9 +- ...ce_proxy_async.inc => fence_proxy_async.h} | 13 ++- ...ic.inc => fence_proxy_tensormap_generic.h} | 19 +++- .../generated/{get_sreg.inc => get_sreg.h} | 103 ++++++++++-------- .../{getctarank.inc => getctarank.h} | 9 +- ...{mbarrier_arrive.inc => mbarrier_arrive.h} | 19 +++- ...ect_tx.inc => mbarrier_arrive_expect_tx.h} | 11 +- ...lete.inc => mbarrier_arrive_no_complete.h} | 9 +- .../{mbarrier_init.inc => mbarrier_init.h} | 9 +- ...ier_test_wait.inc => mbarrier_test_wait.h} | 11 +- ...parity.inc => mbarrier_test_wait_parity.h} | 11 +- ...rrier_try_wait.inc => mbarrier_try_wait.h} | 15 ++- ..._parity.inc => mbarrier_try_wait_parity.h} | 15 ++- .../generated/{red_async.inc => red_async.h} | 27 +++-- .../generated/{st_async.inc => st_async.h} | 11 +- ...nceproxy.inc => tensormap_cp_fenceproxy.h} | 13 ++- ...sormap_replace.inc => tensormap_replace.h} | 7 ++ .../cuda/__ptx/instructions/get_sreg.h | 2 +- 
.../cuda/__ptx/instructions/getctarank.h | 2 +- .../cuda/__ptx/instructions/mbarrier_arrive.h | 6 +- .../cuda/__ptx/instructions/mbarrier_init.h | 2 +- .../cuda/__ptx/instructions/mbarrier_wait.h | 8 +- .../cuda/__ptx/instructions/red_async.h | 2 +- .../cuda/__ptx/instructions/st_async.h | 2 +- .../instructions/tensormap_cp_fenceproxy.h | 2 +- .../__ptx/instructions/tensormap_replace.h | 2 +- .../test/internal_headers/CMakeLists.txt | 3 + 78 files changed, 631 insertions(+), 332 deletions(-) rename libcudacxx/include/cuda/__ptx/instructions/generated/{barrier_cluster.inc => barrier_cluster.h} (92%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk.inc => cp_async_bulk.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_commit_group.inc => cp_async_bulk_commit_group.h} (73%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_multicast.inc => cp_async_bulk_multicast.h} (86%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor.inc => cp_async_bulk_tensor.h} (96%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor_multicast.inc => cp_async_bulk_tensor_multicast.h} (95%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_wait_group.inc => cp_async_bulk_wait_group.h} (82%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk.inc => cp_reduce_async_bulk.h} (97%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_bf16.inc => cp_reduce_async_bulk_bf16.h} (89%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_f16.inc => cp_reduce_async_bulk_f16.h} (89%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_tensor.inc => cp_reduce_async_bulk_tensor.h} (91%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence.inc => fence.h} (81%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_mbarrier_init.inc => fence_mbarrier_init.h} (80%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_alias.inc => fence_proxy_alias.h} (74%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_async.inc => fence_proxy_async.h} (83%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_tensormap_generic.inc => fence_proxy_tensormap_generic.h} (85%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{get_sreg.inc => get_sreg.h} (95%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{getctarank.inc => getctarank.h} (81%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive.inc => mbarrier_arrive.h} (94%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_expect_tx.inc => mbarrier_arrive_expect_tx.h} (90%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_no_complete.inc => mbarrier_arrive_no_complete.h} (79%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_init.inc => mbarrier_init.h} (78%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait.inc => mbarrier_test_wait.h} (90%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait_parity.inc => mbarrier_test_wait_parity.h} (90%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait.inc => mbarrier_try_wait.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait_parity.inc => 
mbarrier_try_wait_parity.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{red_async.inc => red_async.h} (97%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{st_async.inc => st_async.h} (93%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_cp_fenceproxy.inc => tensormap_cp_fenceproxy.h} (85%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_replace.inc => tensormap_replace.h} (99%) diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index bd994990c05..2e3b8bac188 100644 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + barrier.cluster.arrive ^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive(); barrier.cluster.wait @@ -13,7 +16,7 @@ barrier.cluster.wait // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait(); barrier.cluster.arrive.release @@ -23,7 +26,7 @@ barrier.cluster.arrive.release // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); @@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); @@ -45,6 +48,6 @@ barrier.cluster.wait.acquire // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index f5c236f8bf9..4883d8495eb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst index 984b4aff976..07b9f9acfc1 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.commit_group ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template + template __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index 9cb15d06fa3..af027c0b623 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 40eb070e66a..1c21efdd0a3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index 2481c80bf3c..ac33a05b69f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst index 08ebd3c28a7..06ff8e9014c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. 
+ cp.async.bulk.wait_group ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index cc82d633375..b043eb9f456 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -64,7 +67,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -83,7 +86,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -102,7 +105,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -121,7 +124,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.in // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -140,7 +143,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.de // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -159,7 +162,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -178,7 +181,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -197,7 +200,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -216,7 +219,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -235,7 +238,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void 
cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -362,7 +365,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -380,7 +383,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -398,7 +401,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -416,7 +419,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -434,7 +437,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -452,7 +455,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -470,7 +473,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -488,7 +491,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -506,7 +509,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -524,7 +527,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -542,7 +545,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -560,7 +563,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -578,7 +581,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } - template + template __device__ static inline void 
cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -596,7 +599,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -614,7 +617,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -632,7 +635,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst index e4dea98a119..80e927d0375 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst index 18c5e0bfc60..0d658fd9256 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda @@ -7,7 +10,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +28,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +46,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst index c653b01cd60..d587d3f51a2 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_tensor.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index 2fe14dcb3b2..ed21fa80b6e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst index 0f5298e3359..c7dd357632a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.mbarrier_init.release.cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda @@ -5,7 +8,7 @@ fence.mbarrier_init.release.cluster // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } - template + template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst index 935aab9b6df..fdd1f8d0b12 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -1,7 +1,10 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.alias ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template + template __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 3e741a1f6c4..8376e96ce6b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -1,9 +1,12 @@ +.. 
+ This file was automatically generated. Do not edit. + fence.proxy.async ^^^^^^^^^^^^^^^^^ .. code:: cuda // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template + template __device__ static inline void fence_proxy_async(); fence.proxy.async.global diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst index db582971c3d..78c3cd308a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_tensormap_generic.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + fence.proxy.tensormap::generic.release.cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index c85f52ee302..374c182576f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -1,10 +1,13 @@ +.. + This file was automatically generated. Do not edit. + getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } - template + template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 92cd106cad9..21436e2b3ca 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); @@ -12,7 +15,7 @@ mbarrier.arrive.shared::cta.b64 .. code:: cuda // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -87,7 +90,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -102,7 +105,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 0087ae2f458..47c56eca31a 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda @@ -38,7 +41,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst index b6d7edbbeee..ba909ae1f56 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.arrive.noComplete.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst index b87d6f62a23..46adcd16be3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_expect_tx.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.expect_tx.relaxed.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 3e529d86d78..2c3520a20f6 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.init.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 - template + template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index 4cb241c7ca8..d16b2ac07ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index e750c4a543f..ec464b3398b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.test_wait.parity.shared.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. 
PTX ISA 71, SM_80 - template + template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index ce648c66ee9..3dfdba46861 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -13,7 +16,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 3210dc0eab1..4e7af4bace5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + mbarrier.try_wait.parity.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -13,7 +16,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index d6b9cf36549..658fe0a8f44 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda @@ -5,7 +8,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -19,7 +22,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -33,7 +36,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -47,7 +50,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -61,7 +64,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -75,7 +78,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -89,7 +92,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -103,7 +106,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -159,7 +162,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -172,7 +175,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, 
[remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index c519ea57f70..d00a152cf29 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst index 52fae102ad4..e42bae5a5a0 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_cp_fenceproxy.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index 33e6f1d839a..a8c4a260782 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -1,3 +1,6 @@ +.. + This file was automatically generated. Do not edit. + tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index 8b09ddd1110..93b6a06037c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 480a02a701e..abfba441ac9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index bd97259cf19..f9320e975f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. 
Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 5b9f575ce5f..7de5b41b744 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 00a3700e1a9..0d933e2cc34 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee6d90bc4d9..f1487301ada 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,12 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -#include +#include #ifdef _LIBCUDACXX_HAS_NVF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index a6b23a706c7..436c42d4c3f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 045f09cb40e..a8dccf979c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,11 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. 
Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index ca9238bc3ff..10d55714c5b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ + /* // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive(); */ #if __cccl_ptx_isa >= 780 @@ -24,7 +29,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive() /* // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait(); */ #if __cccl_ptx_isa >= 780 @@ -48,7 +53,7 @@ _CCCL_DEVICE static inline void barrier_cluster_wait() // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); */ @@ -74,7 +79,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); */ @@ -100,7 +105,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); */ @@ -121,3 +126,5 @@ _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 69f77053b95..8ba40d45f64 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ + /* // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. 
unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -82,7 +87,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -109,3 +114,5 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h similarity index 73% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 24baddaea8f..7bb58675ddb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ + /* // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template +template __device__ static inline void cp_async_bulk_commit_group(); */ #if __cccl_ptx_isa >= 800 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_commit_group() __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index cdd5a535eb6..a5534ef0b48 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ + /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
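A hedged sketch of the unicast cp_async_bulk path shown above (global to shared::cluster, completion signalled on a shared-memory mbarrier). The helper and parameter names are assumptions; smem_dst and smem_bar must point into shared memory, and bytes is the transfer size in bytes:

#include <cuda/ptx>
#include <cstdint>

__device__ void stage_tile(void* smem_dst, const void* gmem_src, std::uint32_t bytes, std::uint64_t* smem_bar)
{
  // One bulk copy from global memory into (cluster-)shared memory; the mbarrier
  // at smem_bar is updated when the bytes have arrived.
  cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global, smem_dst, gmem_src, bytes, smem_bar);
  // Completion is then observed by waiting on the mbarrier (not shown here).
}
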
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -43,3 +48,5 @@ _CCCL_DEVICE static inline void cp_async_bulk( __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h similarity index 96% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 547888d5b0f..3cbd26fda04 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -42,7 +47,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -79,7 +84,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -122,7 +127,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -244,7 +249,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1d. 
PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -289,7 +294,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -331,7 +336,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -414,3 +419,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 020698a15b1..915979d18f3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -96,7 +101,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2d. 
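A sketch of the 2-D cp_async_bulk_tensor variant from the header above; the tensor map is assumed to be a CUtensorMap prepared on the host, and all names are illustrative:

#include <cuda/ptx>
#include <cstdint>

__device__ void load_tile_2d(
  void* smem_dst, const void* tensor_map, std::int32_t col, std::int32_t row, std::uint64_t* smem_bar)
{
  // Fetch the tile at (col, row) described by tensor_map into shared memory,
  // signalling the shared-memory mbarrier on completion.
  const std::int32_t coords[2] = {col, row};
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster, cuda::ptx::space_global, smem_dst, tensor_map, coords, smem_bar);
}
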
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -193,7 +198,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -237,3 +242,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h similarity index 82% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 1a715a0fac6..2057323665a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ + /* // cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 template @@ -7,13 +12,13 @@ __device__ static inline void cp_async_bulk_wait_group( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message @@ -30,16 +35,18 @@ __device__ static inline void cp_async_bulk_wait_group_read( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group.read %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 50059ff6c5b..a35684c85e1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc +++ 
b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -1,5 +1,8 @@ -// 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ + /* // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90 @@ -154,7 +157,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -203,7 +206,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -252,7 +255,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -301,7 +304,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -350,7 +353,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -399,7 +402,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -448,7 +451,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -497,7 +500,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -546,7 +549,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -595,7 +598,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -670,7 +673,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
: : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -715,7 +718,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -760,7 +763,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -778,7 +781,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -820,7 +823,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -862,7 +865,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -904,7 +907,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -946,7 +949,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -988,7 +991,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1030,7 +1033,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1072,7 +1075,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1114,7 +1117,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1156,7 +1159,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( 
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1198,7 +1201,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1240,7 +1243,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1282,7 +1285,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1324,7 +1327,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1361,7 +1364,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1398,7 +1401,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1433,3 +1436,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index c657e8d1935..1e13bb5f4f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -1,11 +1,15 @@ -#ifdef _LIBCUDACXX_HAS_NVBF16 +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
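A sketch combining the shared::cta to global cp_reduce_async_bulk overloads above with the cp_async_bulk_commit_group and cp_async_bulk_wait_group_read wrappers from earlier in this patch; function and variable names are assumptions, and bytes is the transfer size in bytes:

#include <cuda/ptx>
#include <cstdint>

__device__ void flush_partials(std::uint32_t* gmem_acc, const std::uint32_t* smem_partials, std::uint32_t bytes)
{
  namespace ptx = cuda::ptx;
  // Element-wise add the shared-memory partial sums into the global accumulator.
  ptx::cp_reduce_async_bulk(ptx::space_global, ptx::space_shared, ptx::op_add, gmem_acc, smem_partials, bytes);
  ptx::cp_async_bulk_commit_group();                   // close the current bulk-async group
  ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>{}); // wait until the group's reads of smem_partials are done
}
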
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -14,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -39,7 +43,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 @@ -47,7 +51,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -56,7 +60,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -81,7 +85,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
PTX ISA 80, SM_90 @@ -89,7 +93,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +102,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -123,5 +127,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h similarity index 89% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3a52630db53..0c4678c95bb 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -1,10 +1,15 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -13,7 +18,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -33,7 +38,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
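The same bulk-reduce pattern for the __nv_bfloat16 overloads above, usable only when bf16 support is enabled (_LIBCUDACXX_HAS_NVBF16); a hedged sketch with invented names:

#include <cuda/ptx>
#include <cuda_bf16.h>
#include <cstdint>

__device__ void min_into_global_bf16(__nv_bfloat16* gmem_dst, const __nv_bfloat16* smem_src, std::uint32_t bytes)
{
  // Element-wise minimum of a shared-memory bf16 tile into a global bf16 buffer.
  cuda::ptx::cp_reduce_async_bulk(
    cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min, gmem_dst, smem_src, bytes);
}
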
PTX ISA 80, SM_90 @@ -41,7 +46,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -50,7 +55,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -70,7 +75,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 @@ -78,7 +83,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -87,7 +92,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -107,4 +112,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 32008f6af5b..9ec5b2443d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ + /* // cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, SM_90 @@ -37,37 +42,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) @@ -118,37 +123,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) @@ -203,7 +208,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -212,7 +217,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -221,7 +226,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -230,7 +235,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -239,7 +244,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -248,7 +253,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -257,7 +262,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -317,7 +322,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
: : "l"(__tensorMap), @@ -327,7 +332,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -337,7 +342,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -347,7 +352,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -357,7 +362,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -367,7 +372,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -377,7 +382,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -440,7 +445,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -452,7 +457,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -464,7 +469,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." 
: @@ -476,7 +481,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -488,7 +493,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -500,7 +505,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " "1e." : @@ -512,7 +517,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -530,3 +535,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index f10ec07ebb5..db00c4d4cba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_H_ +#define _CUDA_PTX_GENERATED_FENCE_H_ + /* // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 // .sem = { .sc, .acq_rel } @@ -19,15 +24,15 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope ( _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { asm volatile("fence.acq_rel.gpu; // 1." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); }), ( @@ -57,7 +62,7 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) ( _CCCL_IF_CONSTEXPR (__sem == sem_sc) { asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); }), ( @@ -65,3 +70,5 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) __cuda_ptx_fence_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h similarity index 80% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index 0d39c222598..e185913b3cd 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -1,8 +1,13 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ + /* // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } -template +template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h similarity index 74% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 98260b851ca..40229b84a96 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ + /* // fence.proxy.alias; // 4. 
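A short sketch of the fence wrappers regenerated above: a device-scope acq_rel fence followed by a generic-to-async proxy fence, as typically issued before handing shared-memory data to a bulk-copy engine; the function name is an assumption and the proxy fence requires sm_90:

#include <cuda/ptx>

__device__ void publish_before_bulk_copy()
{
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu); // fence.acq_rel.gpu
  cuda::ptx::fence_proxy_async();                                 // fence.proxy.async
}
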
PTX ISA 75, SM_70 -template +template __device__ static inline void fence_proxy_alias(); */ #if __cccl_ptx_isa >= 750 @@ -19,3 +24,5 @@ _CCCL_DEVICE static inline void fence_proxy_alias() __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); } #endif // __cccl_ptx_isa >= 750 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f0a37baabdb..f64b5faee5e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ + /* // fence.proxy.async; // 5. PTX ISA 80, SM_90 -template +template __device__ static inline void fence_proxy_async(); */ #if __cccl_ptx_isa >= 800 @@ -38,9 +43,9 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) ( _CCCL_IF_CONSTEXPR (__space == space_global) { asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + } else _CCCL_IF_CONSTEXPR (__space == space_shared) { asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); }), ( @@ -48,3 +53,5 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 3e5b2a265f4..1e6119ee032 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ + /* // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 // .sem = { .release } @@ -19,11 +24,11 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco ( _CCCL_IF_CONSTEXPR (__scope == scope_cta) { asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); }), ( @@ -59,17 +64,17 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) @@ -80,3 +85,5 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index dd3079915f7..08128cc00a1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
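A sketch of the fence_proxy_tensormap_generic release/acquire pair completed just above: the writer releases a tensor map it modified through the generic proxy, and a consumer acquires the 128-byte object before using it with bulk tensor copies. The helper names and the gpu scope are assumptions:

#include <cuda/ptx>

__device__ void release_tensor_map()
{
  cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
}

__device__ void acquire_tensor_map(const void* tensor_map)
{
  cuda::ptx::fence_proxy_tensormap_generic(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, tensor_map, cuda::ptx::n32_t<128>{});
}
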
+ +#ifndef _CUDA_PTX_GENERATED_GET_SREG_H_ +#define _CUDA_PTX_GENERATED_GET_SREG_H_ + /* // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_x(); */ #if __cccl_ptx_isa >= 200 @@ -15,7 +20,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() /* // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_y(); */ #if __cccl_ptx_isa >= 200 @@ -30,7 +35,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() /* // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_z(); */ #if __cccl_ptx_isa >= 200 @@ -45,7 +50,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() /* // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_x(); */ #if __cccl_ptx_isa >= 200 @@ -60,7 +65,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() /* // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_y(); */ #if __cccl_ptx_isa >= 200 @@ -75,7 +80,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() /* // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_z(); */ #if __cccl_ptx_isa >= 200 @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() /* // mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_laneid(); */ #if __cccl_ptx_isa >= 130 @@ -105,7 +110,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() /* // mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_warpid(); */ #if __cccl_ptx_isa >= 130 @@ -120,7 +125,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() /* // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nwarpid(); */ #if __cccl_ptx_isa >= 200 @@ -144,7 +149,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() /* // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -159,7 +164,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() /* // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -174,7 +179,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() /* // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -189,7 +194,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() /* // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -204,7 +209,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() /* // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -219,7 +224,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() /* // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_z(); 
*/ #if __cccl_ptx_isa >= 200 @@ -234,7 +239,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() /* // mov.u32 sreg_value, %%smid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_smid(); */ #if __cccl_ptx_isa >= 130 @@ -249,7 +254,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() /* // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nsmid(); */ #if __cccl_ptx_isa >= 200 @@ -273,7 +278,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() /* // mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template +template __device__ static inline uint64_t get_sreg_gridid(); */ #if __cccl_ptx_isa >= 300 @@ -288,7 +293,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() /* // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template +template __device__ static inline bool get_sreg_is_explicit_cluster(); */ #if __cccl_ptx_isa >= 780 @@ -315,7 +320,7 @@ _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() /* // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -339,7 +344,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() /* // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -363,7 +368,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() /* // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -387,7 +392,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() /* // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -411,7 +416,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() /* // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -435,7 +440,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() /* // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -459,7 +464,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() /* // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -483,7 +488,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() /* // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -507,7 +512,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() /* // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -531,7 +536,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() /* // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 
-template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -555,7 +560,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() /* // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -579,7 +584,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() /* // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -603,7 +608,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() /* // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctarank(); */ #if __cccl_ptx_isa >= 780 @@ -627,7 +632,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() /* // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctarank(); */ #if __cccl_ptx_isa >= 780 @@ -651,7 +656,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() /* // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_eq(); */ #if __cccl_ptx_isa >= 200 @@ -675,7 +680,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() /* // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_le(); */ #if __cccl_ptx_isa >= 200 @@ -699,7 +704,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() /* // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_lt(); */ #if __cccl_ptx_isa >= 200 @@ -723,7 +728,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() /* // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_ge(); */ #if __cccl_ptx_isa >= 200 @@ -747,7 +752,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() /* // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_gt(); */ #if __cccl_ptx_isa >= 200 @@ -771,7 +776,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() /* // mov.u32 sreg_value, %%clock; // PTX ISA 10 -template +template __device__ static inline uint32_t get_sreg_clock(); */ #if __cccl_ptx_isa >= 100 @@ -786,7 +791,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() /* // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template +template __device__ static inline uint32_t get_sreg_clock_hi(); */ #if __cccl_ptx_isa >= 500 @@ -810,7 +815,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() /* // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template +template __device__ static inline uint64_t get_sreg_clock64(); */ #if __cccl_ptx_isa >= 200 @@ -834,7 +839,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() /* // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template +template __device__ static inline uint64_t get_sreg_globaltimer(); */ #if __cccl_ptx_isa >= 310 @@ -858,7 +863,7 @@ _CCCL_DEVICE static inline 
_CUDA_VSTD::uint64_t get_sreg_globaltimer() /* // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_lo(); */ #if __cccl_ptx_isa >= 310 @@ -882,7 +887,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() /* // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_hi(); */ #if __cccl_ptx_isa >= 310 @@ -906,7 +911,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() /* // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_total_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -930,7 +935,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() /* // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template +template __device__ static inline uint32_t get_sreg_aggr_smem_size(); */ #if __cccl_ptx_isa >= 810 @@ -954,7 +959,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() /* // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -978,7 +983,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() /* // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template +template __device__ static inline uint64_t get_sreg_current_graph_exec(); */ #if __cccl_ptx_isa >= 800 @@ -999,3 +1004,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_GET_SREG_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h similarity index 81% rename from libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 51bd351be87..a769868f45c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -1,7 +1,12 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_GETCTARANK_H_ +#define _CUDA_PTX_GENERATED_GETCTARANK_H_ + /* // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } -template +template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); @@ -25,3 +30,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, cons __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_GETCTARANK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index f3e2b860d50..e1afe25d8c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ + /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ @@ -25,7 +30,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint /* // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -79,7 +84,7 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) @@ -125,7 +130,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -142,7 +147,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -175,7 +180,7 @@ mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uin // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -203,3 +208,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive( __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index efb749957b1..79301a57851 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ + /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } @@ -32,7 +37,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -49,7 +54,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -77,3 +82,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h similarity index 79% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 879bedebdc9..cbfb275baa4 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); @@ -24,3 +29,5 @@ mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h similarity index 78% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 3afeeacfccf..d1e5c57c97e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ + /* // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template +template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); @@ -21,3 +26,5 @@ _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, cons __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 301c0364af4..f3dbb6ed1c3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ + /* // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template +ISA 70, SM_80 template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); @@ -58,7 +63,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
" "\n\t" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 604cfd92045..b975434b2de 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ + /* // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template +ISA 71, SM_80 template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -59,7 +64,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -73,3 +78,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index c5f2062664c..dd50a2c9f41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ + /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -29,7 +34,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX -ISA 78, SM_90 template +ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, @@ -89,7 +94,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. " "\n\t" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index 321bfc515da..d3deb3ca1d5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -1,6 +1,11 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ + /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -30,7 +35,7 @@ mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_ /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, @@ -90,7 +95,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -141,7 +146,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +160,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 3157fa1c627..d88392f3635 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -1,9 +1,14 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_RED_ASYNC_H_ +#define _CUDA_PTX_GENERATED_RED_ASYNC_H_ + /* // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -35,7 +40,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -67,7 +72,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -99,7 +104,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -131,7 +136,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -163,7 +168,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -195,7 +200,7 @@ red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -227,7 +232,7 @@ red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -358,7 +363,7 @@ red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -389,7 +394,7 @@ _CCCL_DEVICE static inline void red_async( // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, @@ -415,3 +420,5 @@ red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __va 
__cuda_ptx_red_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_RED_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 9dfab243ffe..18fd2c03a41 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_ST_ASYNC_H_ +#define _CUDA_PTX_GENERATED_ST_ASYNC_H_ + /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 @@ -22,7 +27,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _C : : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " : : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) @@ -61,7 +66,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2 "r"(__as_b32(__value[1])), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " : : "r"(__as_ptr_remote_dsmem(__addr)), @@ -106,3 +111,5 @@ _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_ST_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index 033d0606e7f..b51b5185db0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ + /* // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 @@ -28,19 +33,19 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" : @@ -52,3 +57,5 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h similarity index 99% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3b1060ead38..3889026750d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -1,3 +1,8 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ + /* // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a // .space = { .global } @@ -567,3 +572,5 @@ _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 033005beb5b..3157f7d1da9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f5ed3424d3b..c41084f5ae3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. 
Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index fb1341a61d8..0a44942df82 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,9 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#include -#include -#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 575abda7a41..b3539245e03 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 2d6adb78eec..dfcc03bc01c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,10 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -#include -#include -#include -#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index a610cf2b583..d14a96dc725 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index 09199b4a3ce..ffad9f176d0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. 
Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index de179f69735..22eaa502305 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18. Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index 2f81d8b4361..681a820b070 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/test/internal_headers/CMakeLists.txt b/libcudacxx/test/internal_headers/CMakeLists.txt index 4c1031e5b4f..1f1e4947efb 100644 --- a/libcudacxx/test/internal_headers/CMakeLists.txt +++ b/libcudacxx/test/internal_headers/CMakeLists.txt @@ -26,6 +26,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND NOT "${CMAKE_CXX_STANDARD}" M list(FILTER internal_headers EXCLUDE REGEX "mdspan") endif() +# generated cuda::ptx headers are not standalone +list(FILTER internal_headers EXCLUDE REGEX "__ptx/instructions/generated") + function(libcudacxx_create_internal_header_test header_name, headertest_src, fallback) if(fallback) set(header_name "${header_name}_fallback") From efee771d1b5cdf1feb2ddc256249d14ec0768839 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Mon, 25 Nov 2024 19:41:13 +0100 Subject: [PATCH 22/45] Regenerate PTX test (#2953) Overwrites all generated PTX tests and runs `pre-commit run --all-files` --- ...{barrier_cluster.inc => barrier_cluster.h} | 16 +++++++++++ .../{cp_async_bulk.inc => cp_async_bulk.h} | 16 +++++++++++ .../generated/cp_async_bulk_commit_group.h | 26 +++++++++++++++++ .../generated/cp_async_bulk_commit_group.inc | 10 ------- ...ulticast.inc => cp_async_bulk_multicast.h} | 16 +++++++++++ ...bulk_tensor.inc => cp_async_bulk_tensor.h} | 16 +++++++++++ ...t.inc => cp_async_bulk_tensor_multicast.h} | 16 +++++++++++ ...t_group.inc => cp_async_bulk_wait_group.h} | 16 +++++++++++ ..._async_bulk.inc => cp_reduce_async_bulk.h} | 16 +++++++++++ ...k_bf16.inc => cp_reduce_async_bulk_bf16.h} | 28 +++++++++++++++---- ...ulk_f16.inc => cp_reduce_async_bulk_f16.h} | 28 +++++++++++++++---- ...nsor.inc => cp_reduce_async_bulk_tensor.h} | 16 +++++++++++ .../cuda/ptx/generated/{fence.inc => fence.h} | 16 +++++++++++ .../cuda/ptx/generated/fence_mbarrier_init.h | 27 ++++++++++++++++++ .../ptx/generated/fence_mbarrier_init.inc | 11 -------- .../cuda/ptx/generated/fence_proxy_alias.h | 25 +++++++++++++++++ 
.../cuda/ptx/generated/fence_proxy_alias.inc | 9 ------ ...ce_proxy_async.inc => fence_proxy_async.h} | 16 +++++++++++ ...ic.inc => fence_proxy_tensormap_generic.h} | 16 +++++++++++ .../generated/{get_sreg.inc => get_sreg.h} | 16 +++++++++++ .../cuda/ptx/generated/getctarank.h | 26 +++++++++++++++++ .../cuda/ptx/generated/getctarank.inc | 10 ------- ...{mbarrier_arrive.inc => mbarrier_arrive.h} | 16 +++++++++++ ...ect_tx.inc => mbarrier_arrive_expect_tx.h} | 16 +++++++++++ .../generated/mbarrier_arrive_no_complete.h | 26 +++++++++++++++++ .../generated/mbarrier_arrive_no_complete.inc | 10 ------- .../cuda/ptx/generated/mbarrier_init.h | 26 +++++++++++++++++ .../cuda/ptx/generated/mbarrier_init.inc | 10 ------- ...rrier_try_wait.inc => mbarrier_try_wait.h} | 16 +++++++++++ ..._parity.inc => mbarrier_try_wait_parity.h} | 16 +++++++++++ .../{mbarrier_wait.inc => mbarrier_wait.h} | 0 ...wait_parity.inc => mbarrier_wait_parity.h} | 0 .../generated/{red_async.inc => red_async.h} | 16 +++++++++++ .../generated/{st_async.inc => st_async.h} | 16 +++++++++++ ...nceproxy.inc => tensormap_cp_fenceproxy.h} | 16 +++++++++++ ...sormap_replace.inc => tensormap_replace.h} | 16 +++++++++++ .../ptx/ptx.barrier.cluster.compile.pass.cpp | 19 +------------ ...p.async.bulk.commit_group.compile.pass.cpp | 19 +------------ .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 19 +------------ ...x.cp.async.bulk.multicast.compile.pass.cpp | 19 +------------ .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 19 +------------ ...ync.bulk.tensor.multicast.compile.pass.cpp | 19 +------------ ....cp.async.bulk.wait_group.compile.pass.cpp | 19 +------------ .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 23 ++------------- ....reduce.async.bulk.tensor.compile.pass.cpp | 19 +------------ .../cuda/ptx/ptx.fence.compile.pass.cpp | 27 ++++-------------- .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 19 +------------ .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 19 +------------ .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 23 ++------------- .../ptx/ptx.mbarrier.init.compile.pass.cpp | 19 +------------ .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 25 +++-------------- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 19 +------------ .../cuda/ptx/ptx.st.async.compile.pass.cpp | 19 +------------ ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 19 +------------ .../ptx.tensormap.replace.compile.pass.cpp | 19 +------------ 55 files changed, 550 insertions(+), 425 deletions(-) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{barrier_cluster.inc => barrier_cluster.h} (69%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk.inc => cp_async_bulk.h} (66%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_multicast.inc => cp_async_bulk_multicast.h} (51%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_tensor.inc => cp_async_bulk_tensor.h} (87%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_tensor_multicast.inc => cp_async_bulk_tensor_multicast.h} (83%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_async_bulk_wait_group.inc => cp_async_bulk_wait_group.h} (50%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk.inc => cp_reduce_async_bulk.h} (96%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk_bf16.inc => 
cp_reduce_async_bulk_bf16.h} (65%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk_f16.inc => cp_reduce_async_bulk_f16.h} (59%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{cp_reduce_async_bulk_tensor.inc => cp_reduce_async_bulk_tensor.h} (97%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{fence.inc => fence.h} (71%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{fence_proxy_async.inc => fence_proxy_async.h} (58%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{fence_proxy_tensormap_generic.inc => fence_proxy_tensormap_generic.h} (78%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{get_sreg.inc => get_sreg.h} (94%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_arrive.inc => mbarrier_arrive.h} (82%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_arrive_expect_tx.inc => mbarrier_arrive_expect_tx.h} (67%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_try_wait.inc => mbarrier_try_wait.h} (77%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_try_wait_parity.inc => mbarrier_try_wait_parity.h} (77%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_wait.inc => mbarrier_wait.h} (100%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{mbarrier_wait_parity.inc => mbarrier_wait_parity.h} (100%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{red_async.inc => red_async.h} (87%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{st_async.inc => st_async.h} (70%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{tensormap_cp_fenceproxy.inc => tensormap_cp_fenceproxy.h} (70%) rename libcudacxx/test/libcudacxx/cuda/ptx/generated/{tensormap_replace.inc => tensormap_replace.h} (91%) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.h similarity index 69% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.h index cad5510ba70..52c47bf2f9d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. 
So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_barrier_cluster(void** fn_ptr) { #if __cccl_ptx_isa >= 780 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h similarity index 66% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h index cd66de989a2..a342954591a 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h new file mode 100644 index 00000000000..b017312d979 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
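
As a minimal, self-contained sketch of the force-instantiation pattern described in the comment above (hypothetical `square` overloads stand in for the cuda::ptx wrappers, and `test_force_ptx` for the generated test_* kernels in this patch):

__device__ unsigned int square(unsigned int x)
{
  return x * x;
}

__device__ unsigned long long square(unsigned long long x)
{
  return x * x;
}

__global__ void test_force_ptx(void** fn_ptr)
{
  // Storing each overload's address through the externally visible kernel
  // parameter keeps the compiler from dead-code-eliminating functions that
  // are never called, so their PTX is emitted and can be consumed by ptxas.
  // This mirrors the `* fn_ptr++ = reinterpret_cast<...>(static_cast<...>(...))`
  // lines in the generated test headers.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<unsigned int (*)(unsigned int)>(square));
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<unsigned long long (*)(unsigned long long)>(square));
}
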
+ +__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.commit_group; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc deleted file mode 100644 index afdf14abb8a..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // cp.async.bulk.commit_group; - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_commit_group));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h similarity index 51% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h index b2bd0d968d9..6e2a986e7bd 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.h similarity index 87% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.h index f9d0d240d28..4618f3ea7a0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. 
+// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk_tensor(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h similarity index 83% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h index 2851aab6d7c..617bc9507bd 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.h similarity index 50% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.h index 0139a65f6ce..fa11225f316 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_cp_async_bulk_wait_group(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.h similarity index 96% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.h index 5ee274bcbe8..6f3195ebf7d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_reduce_async_bulk(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.h similarity index 65% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.h index fe38374fe00..f5bfe7ef8b3 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_bf16.h @@ -1,6 +1,22 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) { -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -12,9 +28,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) __nv_bfloat16*, const __nv_bfloat16*, uint32_t)>(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -26,9 +42,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) __nv_bfloat16*, const __nv_bfloat16*, uint32_t)>(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -40,5 +56,5 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr) __nv_bfloat16*, const __nv_bfloat16*, uint32_t)>(cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.h similarity index 59% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.h index e7e58cfcb80..b2ce91fc12b 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_f16.h @@ -1,6 +1,22 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) { -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -9,9 +25,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) static_cast( cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -20,9 +36,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) static_cast( cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( @@ -31,5 +47,5 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr) static_cast( cuda::ptx::cp_reduce_async_bulk));)); -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.h similarity index 97% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.h index 6f0a7d710ce..270f17a70e3 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h similarity index 71% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h index 2e464580de9..aecfcde5e01 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_fence(void** fn_ptr) { #if __cccl_ptx_isa >= 600 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h new file mode 100644 index 00000000000..29d1bf3f627 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h @@ -0,0 +1,27 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.mbarrier_init.release.cluster; // 3. + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::fence_mbarrier_init));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc deleted file mode 100644 index f503c1d055b..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc +++ /dev/null @@ -1,11 +0,0 @@ -__global__ void test_fence_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // fence.mbarrier_init.release.cluster; // 3. - * fn_ptr++ = reinterpret_cast(static_cast( - cuda::ptx::fence_mbarrier_init));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h new file mode 100644 index 00000000000..474f89f8b0f --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h @@ -0,0 +1,25 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_proxy_alias(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 750 + NV_IF_TARGET(NV_PROVIDES_SM_70, + ( + // fence.proxy.alias; // 4. 
+ * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); +#endif // __cccl_ptx_isa >= 750 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc deleted file mode 100644 index a8021d3f5be..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.inc +++ /dev/null @@ -1,9 +0,0 @@ -__global__ void test_fence_proxy_alias(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 750 - NV_IF_TARGET(NV_PROVIDES_SM_70, - ( - // fence.proxy.alias; // 4. - * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::fence_proxy_alias));)); -#endif // __cccl_ptx_isa >= 750 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.h similarity index 58% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.h index e3d8e6d160a..56ebe6cceb0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_fence_proxy_async(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.h similarity index 78% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.h index 1e0ea93a387..288aa6c3257 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_fence_proxy_tensormap_generic(void** fn_ptr) { #if __cccl_ptx_isa >= 830 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.h similarity index 94% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.h index 90842352f90..dd4326a6a17 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/get_sreg.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_get_sreg(void** fn_ptr) { #if __cccl_ptx_isa >= 200 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h new file mode 100644 index 00000000000..b6e4b06afd6 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_getctarank(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // getctarank.shared::cluster.u32 dest, addr; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::getctarank));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc deleted file mode 100644 index 28b04c9f738..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/getctarank.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_getctarank(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 780 - NV_IF_TARGET(NV_PROVIDES_SM_90, - ( - // getctarank.shared::cluster.u32 dest, addr; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::getctarank));)); -#endif // __cccl_ptx_isa >= 780 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h similarity index 82% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h index 4a94ec51d45..3cddcb3b54c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_mbarrier_arrive(void** fn_ptr) { #if __cccl_ptx_isa >= 700 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h similarity index 67% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h index 085723a452b..a2ef4b619bb 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) { #if __cccl_ptx_isa >= 800 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h new file mode 100644 index 00000000000..9647ff830a8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc deleted file mode 100644 index d1d017cd3c2..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_no_complete.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_mbarrier_arrive_no_complete(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_arrive_no_complete));)); -#endif // __cccl_ptx_isa >= 700 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h new file mode 100644 index 00000000000..d0a87419e77 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_mbarrier_init(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.init.shared.b64 [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_init));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc deleted file mode 100644 index f814161d1f9..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_init.inc +++ /dev/null @@ -1,10 +0,0 @@ -__global__ void test_mbarrier_init(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.init.shared.b64 [addr], count; - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_init));)); -#endif // __cccl_ptx_isa >= 700 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h similarity index 77% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h index e9d8661a07e..00166f8172c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_mbarrier_try_wait(void** fn_ptr) { #if __cccl_ptx_isa >= 780 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h similarity index 77% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h index f8c3875451a..8aa588fbab0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_mbarrier_try_wait_parity(void** fn_ptr) { #if __cccl_ptx_isa >= 780 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h similarity index 100% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.h similarity index 87% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.h index 0d562fd31a7..530d8c85967 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/red_async.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_red_async(void** fn_ptr) { #if __cccl_ptx_isa >= 810 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.h similarity index 70% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.h index 4efb95ef217..05ba9dd521a 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_async.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ __global__ void test_st_async(void** fn_ptr) { #if __cccl_ptx_isa >= 810 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.h similarity index 70% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.h index 9a0a8c1f615..f5293e20ec3 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_tensormap_cp_fenceproxy(void** fn_ptr) { #if __cccl_ptx_isa >= 830 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h similarity index 91% rename from libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc rename to libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h index c69f3d11964..95446eb81fa 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.inc +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h @@ -1,3 +1,19 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + __global__ void test_tensormap_replace(void** fn_ptr) { #if __cccl_ptx_isa >= 830 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index c460a2e5b09..33d08621ef4 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. 
So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/barrier_cluster.inc" +#include "generated/barrier_cluster.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index 4695221dbc5..e7ff21c2730 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_commit_group.inc" +#include "generated/cp_async_bulk_commit_group.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index b1811727b66..fdd35749cc6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk.inc" +#include "generated/cp_async_bulk.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index c040528cabc..ae1546828ae 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -16,24 +16,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. 
- * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_multicast.inc" +#include "generated/cp_async_bulk_multicast.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index 0b69b8a8f1c..eeb7b4bf5a5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_tensor.inc" +#include "generated/cp_async_bulk_tensor.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index 7d53d9ee0c9..d07351a2275 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -16,24 +16,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. 
- * - */ - -#include "generated/cp_async_bulk_tensor_multicast.inc" +#include "generated/cp_async_bulk_tensor_multicast.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 39df53c5f9d..87910d04941 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_async_bulk_wait_group.inc" +#include "generated/cp_async_bulk_wait_group.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index a186e34a809..8b916d74bf9 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -14,31 +14,14 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_reduce_async_bulk.inc" +#include "generated/cp_reduce_async_bulk.h" #ifdef _LIBCUDACXX_HAS_NVF16 -# include "generated/cp_reduce_async_bulk_f16.inc" +# include "generated/cp_reduce_async_bulk_f16.h" #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include "generated/cp_reduce_async_bulk_bf16.inc" +# include "generated/cp_reduce_async_bulk_bf16.h" #endif // _LIBCUDACXX_HAS_NVBF16 int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index 14abc0d3ae6..f6a6fd61735 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. 
- * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/cp_reduce_async_bulk_tensor.inc" +#include "generated/cp_reduce_async_bulk_tensor.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 641cb83f172..56f54b345f7 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -14,28 +14,11 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/fence.inc" -#include "generated/fence_mbarrier_init.inc" -#include "generated/fence_proxy_alias.inc" -#include "generated/fence_proxy_async.inc" -#include "generated/fence_proxy_tensormap_generic.inc" +#include "generated/fence.h" +#include "generated/fence_mbarrier_init.h" +#include "generated/fence_proxy_alias.h" +#include "generated/fence_proxy_async.h" +#include "generated/fence_proxy_tensormap_generic.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 697cc00a1be..91a6dd94bf1 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -15,24 +15,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. 
- * - */ - -#include "generated/get_sreg.inc" +#include "generated/get_sreg.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index 80fc71c0998..ed39816b7d6 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/getctarank.inc" +#include "generated/getctarank.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 2350b176630..93263910906 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -14,26 +14,9 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/mbarrier_arrive.inc" -#include "generated/mbarrier_arrive_expect_tx.inc" -#include "generated/mbarrier_arrive_no_complete.inc" +#include "generated/mbarrier_arrive.h" +#include "generated/mbarrier_arrive_expect_tx.h" +#include "generated/mbarrier_arrive_no_complete.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index b445a61a8a9..7af0db56b70 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. 
- * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/mbarrier_init.inc" +#include "generated/mbarrier_init.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index e9c17a2024d..896abb8a7d8 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -14,27 +14,10 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/mbarrier_try_wait.inc" -#include "generated/mbarrier_try_wait_parity.inc" -#include "generated/mbarrier_wait.inc" -#include "generated/mbarrier_wait_parity.inc" +#include "generated/mbarrier_try_wait.h" +#include "generated/mbarrier_try_wait_parity.h" +#include "generated/mbarrier_wait.h" +#include "generated/mbarrier_wait_parity.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index 4a380ec8396..c6f66503b1f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/red_async.inc" +#include "generated/red_async.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 2c74f48e04d..7c008b77126 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. 
So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/st_async.inc" +#include "generated/st_async.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index d0d3a967836..bb5578fc730 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. - * - */ - -#include "generated/tensormap_cp_fenceproxy.inc" +#include "generated/tensormap_cp_fenceproxy.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index d780ff26dca..264b7956fbb 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -14,24 +14,7 @@ #include #include -/* - * We use a special strategy to force the generation of the PTX. This is mainly - * a fight against dead-code-elimination in the NVVM layer. - * - * The reason we need this strategy is because certain older versions of ptxas - * segfault when a non-sensical sequence of PTX is generated. So instead, we try - * to force the instantiation and compilation to PTX of all the overloads of the - * PTX wrapping functions. - * - * We do this by writing a function pointer of each overload to the kernel - * parameter `fn_ptr`. - * - * Because `fn_ptr` is possibly visible outside this translation unit, the - * compiler must compile all the functions which are stored. 
- * - */ - -#include "generated/tensormap_replace.inc" +#include "generated/tensormap_replace.h" int main(int, char**) { From dc920c93749d0b050dd306172c5a8888a4cf058a Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 25 Nov 2024 21:02:52 +0100 Subject: [PATCH 23/45] Do not include extended floating point headers if they are not needed (#2956) Fixes #2933 --- c2h/include/c2h/generators.h | 19 ++++++++++++++++++- cub/cub/detail/fast_modulo_division.cuh | 6 +++--- cub/cub/thread/thread_operators.cuh | 11 +++++++++++ cub/cub/thread/thread_reduce.cuh | 11 +++++++++++ cub/cub/util_type.cuh | 9 +++++++++ .../cuda/std/__cccl/extended_floating_point.h | 11 ----------- .../is_extended_floating_point.h | 18 +++++++++++------- .../include/cuda/std/__type_traits/promote.h | 1 + thrust/thrust/system/cuda/detail/sort.h | 11 +++++++++++ 9 files changed, 75 insertions(+), 22 deletions(-) diff --git a/c2h/include/c2h/generators.h b/c2h/include/c2h/generators.h index 20036088fa8..62f169e9e21 100644 --- a/c2h/include/c2h/generators.h +++ b/c2h/include/c2h/generators.h @@ -35,7 +35,24 @@ #include #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA -# include // for +# if defined(_CCCL_HAS_NVFP16) +# include +# endif // _CCCL_HAS_NVFP16 + +# if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP + +# if _CCCL_CUDACC_AT_LEAST(11, 8) +// cuda_fp8.h resets default for C4127, so we have to guard the inclusion +_CCCL_DIAG_PUSH +# include +_CCCL_DIAG_POP +# endif // _CCCL_CUDACC_AT_LEAST(11, 8) +# endif // _CCCL_HAS_NVBF16 + # if defined(__CUDA_FP8_TYPES_EXIST__) namespace std { diff --git a/cub/cub/detail/fast_modulo_division.cuh b/cub/cub/detail/fast_modulo_division.cuh index aa2ffd371c0..24b3204801d 100644 --- a/cub/cub/detail/fast_modulo_division.cuh +++ b/cub/cub/detail/fast_modulo_division.cuh @@ -37,6 +37,9 @@ # pragma system_header #endif // no system header +#include // implicit_prom_t +#include // CUB_IS_INT128_ENABLED + #include // cuda::std::ceil_div #include // std::has_single_bit #include // CHAR_BIT @@ -44,9 +47,6 @@ #include // numeric_limits #include // std::is_integral -#include "cub/detail/type_traits.cuh" // implicit_prom_t -#include "cub/util_type.cuh" // CUB_IS_INT128_ENABLED - #if defined(CCCL_ENABLE_DEVICE_ASSERTIONS) _CCCL_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero #endif // CCCL_ENABLE_DEVICE_ASSERTIONS diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 45d2446188f..05f2d6a41f6 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -56,6 +56,17 @@ #include // cuda::std::common_type #include // cuda::std::forward +#if defined(_CCCL_HAS_NVFP16) +# include +#endif // _CCCL_HAS_NVFP16 + +#if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _CCCL_HAS_NVFP16 + CUB_NAMESPACE_BEGIN // TODO(bgruber): deprecate in C++17 with a note: "replace by decltype(cuda::std::not_fn(EqualityOp{}))" diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index d4b4a89fdfd..f384d907b34 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -54,6 +54,17 @@ #include // uint16_t #include // cuda::std::plus +#if defined(_CCCL_HAS_NVFP16) +# include +#endif // _CCCL_HAS_NVFP16 + +#if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include 
+_CCCL_DIAG_POP +#endif // _CCCL_HAS_NVFP16 + CUB_NAMESPACE_BEGIN //! @rst diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index f062ebc4ae9..5bda9dfe98f 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -50,7 +50,16 @@ #include #include +#if defined(_CCCL_HAS_NVFP16) +# include +#endif // _CCCL_HAS_NVFP16 + #if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP + # if _CCCL_CUDACC_AT_LEAST(11, 8) // cuda_fp8.h resets default for C4127, so we have to guard the inclusion _CCCL_DIAG_PUSH diff --git a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h index 9d3c835c464..d135f406702 100644 --- a/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__cccl/extended_floating_point.h @@ -39,15 +39,4 @@ # endif #endif // !_CCCL_HAS_NVBF16 -#if defined(_CCCL_HAS_NVFP16) -# include -#endif // _CCCL_HAS_NVFP16 - -#if defined(_CCCL_HAS_NVBF16) -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") -# include -_CCCL_DIAG_POP -#endif // _CCCL_HAS_NVFP16 - #endif // __CCCL_EXTENDED_FLOATING_POINT_H diff --git a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h index dcc4330e107..bb1afa4225b 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_extended_floating_point.h @@ -22,6 +22,17 @@ #include +#if defined(_LIBCUDACXX_HAS_NVFP16) +# include +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _LIBCUDACXX_HAS_NVBF16 + _LIBCUDACXX_BEGIN_NAMESPACE_STD template @@ -39,8 +50,6 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v #endif // !_CCCL_NO_VARIABLE_TEMPLATES #if defined(_LIBCUDACXX_HAS_NVFP16) -# include - template <> struct __is_extended_floating_point<__half> : true_type {}; @@ -52,11 +61,6 @@ _CCCL_INLINE_VAR constexpr bool __is_extended_floating_point_v<__half> = true; #endif // _LIBCUDACXX_HAS_NVFP16 #if defined(_LIBCUDACXX_HAS_NVBF16) -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") -# include -_CCCL_DIAG_POP - template <> struct __is_extended_floating_point<__nv_bfloat16> : true_type {}; diff --git a/libcudacxx/include/cuda/std/__type_traits/promote.h b/libcudacxx/include/cuda/std/__type_traits/promote.h index 01b06989513..daa545c5fa1 100644 --- a/libcudacxx/include/cuda/std/__type_traits/promote.h +++ b/libcudacxx/include/cuda/std/__type_traits/promote.h @@ -28,6 +28,7 @@ #ifdef _LIBCUDACXX_HAS_NVFP16 # include #endif // _LIBCUDACXX_HAS_NVFP16 + #ifdef _LIBCUDACXX_HAS_NVBF16 _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") diff --git a/thrust/thrust/system/cuda/detail/sort.h b/thrust/thrust/system/cuda/detail/sort.h index a582cf2f3c6..3de3d5492f7 100644 --- a/thrust/thrust/system/cuda/detail/sort.h +++ b/thrust/thrust/system/cuda/detail/sort.h @@ -60,6 +60,17 @@ # include +# if defined(_CCCL_HAS_NVFP16) +# include +# endif // _CCCL_HAS_NVFP16 + +# if defined(_CCCL_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +# endif // _CCCL_HAS_NVBF16 + THRUST_NAMESPACE_BEGIN namespace cuda_cub { From db47d38e2d7a0352d2b036934729851707ce66d2 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Mon, 25 
Nov 2024 12:27:34 -0800 Subject: [PATCH 24/45] [CUDAX] Add copy_bytes and fill_bytes overloads for mdspan (#2932) * Implement copy_bytes for mdspan * Add final conversion to mdspan and more tests * mdspan fill_bytes * Add docs * Fix issues after rebase * Help old GCC figure out the types * Move runtime extents check to a function * Fix clang and more old GCC fixes --- .../cuda/experimental/__algorithm/common.cuh | 24 +++++- .../cuda/experimental/__algorithm/copy.cuh | 86 ++++++++++++++++++- .../cuda/experimental/__algorithm/fill.cuh | 32 ++++++- cudax/test/algorithm/common.cuh | 29 +++++-- cudax/test/algorithm/copy.cu | 66 +++++++++++++- cudax/test/algorithm/fill.cu | 29 +++++++ 6 files changed, 253 insertions(+), 13 deletions(-) diff --git a/cudax/include/cuda/experimental/__algorithm/common.cuh b/cudax/include/cuda/experimental/__algorithm/common.cuh index 9dd891f7b28..eadb5e50dd5 100644 --- a/cudax/include/cuda/experimental/__algorithm/common.cuh +++ b/cudax/include/cuda/experimental/__algorithm/common.cuh @@ -23,15 +23,17 @@ #include #include +#include #include #include namespace cuda::experimental { + #if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES) template -concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range>; +concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range>; #else template @@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span< int>> = true; template -inline constexpr bool __valid_copy_fill_argument = +inline constexpr bool __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range> || __convertible_to_span<_Tp>; #endif +template > +using __as_mdspan_t = + _CUDA_VSTD::mdspan; + +template +inline constexpr bool __convertible_to_mdspan = false; + +template +inline constexpr bool + __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> = + true; + +template +inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan>; + } // namespace cuda::experimental #endif //__CUDAX_ALGORITHM_COMMON diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh index 9cb5cf99a0a..e2c7c73d51a 100644 --- a/cudax/include/cuda/experimental/__algorithm/copy.cuh +++ b/cudax/include/cuda/experimental/__algorithm/copy.cuh @@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD //! @brief Launches a bytewise memory copy from source to destination into the provided stream. //! -//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one. +//! Both source and destination needs to either be a `contiguous_range` or launch transform to one. +//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain `value_type` member alias. //! Both source and destination type is required to be trivially copyable. //! //! This call might be synchronous if either source or destination is pagable host memory. @@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD //! @param __src Source to copy from //! 
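(The remaining @param lines and the constrained overload itself continue directly below.) As a usage illustration of the one-dimensional copy_bytes documented here, a minimal sketch follows; the umbrella header paths and the surrounding setup are assumptions for the sketch, not part of this patch.

// Hedged usage sketch for the contiguous-range copy_bytes overload (illustrative only).
#include <cuda/experimental/algorithm.cuh> // assumed header providing cudax::copy_bytes
#include <cuda/experimental/stream.cuh>    // assumed header providing cudax::stream_ref
#include <cuda/std/span>
#include <vector>

namespace cudax = cuda::experimental;

void upload(cudax::stream_ref stream, std::vector<int>& host, cuda::std::span<int> device)
{
  // Both arguments are contiguous ranges of trivially copyable elements, so this
  // overload applies (assuming device.size() >= host.size()); because `host` is
  // pageable host memory, the call may synchronize.
  cudax::copy_bytes(stream, host, device);
  stream.wait();
}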
@param __dst Destination to copy into _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) -_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>) +_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>) void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) { __copy_bytes_impl( @@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst))))); } +template +inline constexpr bool __copy_bytes_compatible_extents = false; + +template +inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>, + _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> = + decltype(_CUDA_VSTD::__detail::__check_compatible_extents( + _CUDA_VSTD::integral_constant{}, + _CUDA_VSTD::integer_sequence{}, + _CUDA_VSTD::integer_sequence{}))::value; + +template +_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts) +{ + for (typename _SrcExtents::rank_type __i = 0; __i < __src_exts.rank(); __i++) + { + if (__src_exts.extent(__i) + != static_cast( + __dst_exts.extent((static_cast(__i))))) + { + return false; + } + } + return true; +} + +template +void __nd_copy_bytes_impl(stream_ref __stream, + _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src, + _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst) +{ + static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>, + "Multidimensional copy requires both source and destination extents to be compatible"); + static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>, + "Multidimensional copy requires both source and destination layouts to match"); + + if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents())) + { + _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source"); + } + + __copy_bytes_impl(__stream, + _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()), + _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size())); +} + +//! @brief Launches a bytewise memory copy from source to destination into the provided stream. +//! +//! Both source and destination needs to either be an instance of `cuda::std::mdspan` or launch transform to +//! one. They can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain `mdspan` template +//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Both source and +//! destination type is required to be trivially copyable. +//! +//! This call might be synchronous if either source or destination is pagable host memory. +//! It will be synchronous if both destination and copy is located in host memory. +//! +//! @param __stream Stream that the copy should be inserted into +//! @param __src Source to copy from +//! 
@param __dst Destination to copy into +_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy) +_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>) +void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) +{ + decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src)); + decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)); + decltype(auto) __src_as_arg = static_cast>(__src_transformed); + decltype(auto) __dst_as_arg = static_cast>(__dst_transformed); + __nd_copy_bytes_impl( + __stream, __as_mdspan_t(__src_as_arg), __as_mdspan_t(__dst_as_arg)); +} + } // namespace cuda::experimental #endif // __CUDAX_ALGORITHM_COPY diff --git a/cudax/include/cuda/experimental/__algorithm/fill.cuh b/cudax/include/cuda/experimental/__algorithm/fill.cuh index aeb54235c78..cc7ddc61382 100644 --- a/cudax/include/cuda/experimental/__algorithm/fill.cuh +++ b/cudax/include/cuda/experimental/__algorithm/fill.cuh @@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _ //! @brief Launches an operation to bytewise fill the memory into the provided stream. //! -//! Destination needs to either be a `contiguous_range` or implicitly/launch transform -//! into one. It can't reside in pagable host memory. +//! Destination needs to either be a `contiguous_range` or launch transform +//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain `value_type` member alias. //! Destination type is required to be trivially copyable. //! +//! Destination can't reside in pagable host memory. +//! //! @param __stream Stream that the copy should be inserted into //! @param __dst Destination memory to fill //! @param __value Value to fill into every byte in the destination _CCCL_TEMPLATE(typename _DstTy) -_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>) +_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>) void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) { __fill_bytes_impl(__stream, @@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) __value); } +//! @brief Launches an operation to bytewise fill the memory into the provided stream. +//! +//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform +//! into one. It can also implicitly convert to `cuda::std::mdspan`, but the type needs to contain `mdspan` template +//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`. Destination +//! type is required to be trivially copyable. +//! +//! Destination can't reside in pagable host memory. +//! +//! @param __stream Stream that the copy should be inserted into +//! @param __dst Destination memory to fill +//! 
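(The final @param line and the mdspan fill_bytes overload itself follow below.) To tie the mdspan overloads together, here is a hedged usage sketch modeled on the tests added by this patch: the header path is an assumption, the raw pointers are assumed to reference device or pinned allocations of at least 12 ints each, and the mix of static and dynamic extents is chosen to show the compatibility rule enforced above.

// Hedged usage sketch for the mdspan copy_bytes/fill_bytes overloads (illustrative only).
#include <cuda/experimental/algorithm.cuh> // assumed header providing cudax::copy_bytes/fill_bytes
#include <cuda/std/mdspan>
#include <cstddef>

namespace cudax = cuda::experimental;

void fill_then_copy(cudax::stream_ref stream, int* src_mem, int* dst_mem)
{
  using src_extents = cuda::std::extents<std::size_t, 3, cuda::std::dynamic_extent>;
  cuda::std::mdspan<int, src_extents> src(src_mem, 4);                             // 3 x 4, one static extent
  cuda::std::mdspan<int, cuda::std::dextents<std::size_t, 2>> dst(dst_mem, 3, 4);  // 3 x 4, fully dynamic

  cudax::fill_bytes(stream, src, 0xab); // fills every byte covered by the mapping
  // Static extents are compatible (3 pairs with dynamic_extent), both mdspans use the
  // default layout, and the runtime extents match, so the bytewise copy is accepted;
  // a runtime mismatch would throw invalid_argument as implemented above.
  cudax::copy_bytes(stream, src, dst);
  stream.wait();
}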
@param __value Value to fill into every byte in the destination +_CCCL_TEMPLATE(typename _DstTy) +_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>) +void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value) +{ + decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)); + decltype(auto) __dst_as_arg = static_cast>(__dst_transformed); + auto __dst_mdspan = __as_mdspan_t(__dst_as_arg); + + __fill_bytes_impl( + __stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value); +} + } // namespace cuda::experimental #endif // __CUDAX_ALGORITHM_FILL diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index 2789a1f4802..4b262966190 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p } } +template +auto make_buffer_for_mdspan(Extents extents, char value = 0) +{ + cuda::mr::pinned_memory_resource host_resource; + auto mapping = typename Layout::template mapping{extents}; + + cudax::uninitialized_buffer buffer(host_resource, mapping.required_span_size()); + + memset(buffer.data(), value, buffer.size_bytes()); + + return buffer; +} + namespace cuda::experimental { // Need a type that goes through all launch_transform steps, but is not a contiguous_range +template > struct weird_buffer { const cuda::mr::pinned_memory_resource& resource; @@ -57,7 +71,9 @@ struct weird_buffer : resource(res) , data((int*) res.allocate(s * sizeof(int))) , size(s) - {} + { + memset(data, 0, size); + } ~weird_buffer() { @@ -72,12 +88,18 @@ struct weird_buffer int* data; std::size_t size; - using __as_kernel_arg = cuda::std::span; + using __as_kernel_arg = AsKernelArg; operator cuda::std::span() { return {data, size}; } + + template + operator cuda::std::mdspan() + { + return cuda::std::mdspan{data}; + } }; _CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept @@ -85,9 +107,6 @@ struct weird_buffer return {self.data, self.size}; } }; - -static_assert(std::is_same_v, cuda::std::span>); - } // namespace cuda::experimental #endif // __ALGORITHM_COMMON__ diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu index 07eabba32e6..3db65e22c51 100644 --- a/cudax/test/algorithm/copy.cu +++ b/cudax/test/algorithm/copy.cu @@ -10,7 +10,7 @@ #include "common.cuh" -TEST_CASE("Copy", "[data_manipulation]") +TEST_CASE("1d Copy", "[data_manipulation]") { cudax::stream _stream; @@ -103,3 +103,67 @@ TEST_CASE("Copy", "[data_manipulation]") CUDAX_REQUIRE(vec[1] == 0xbeef); } } + +template +void test_mdspan_copy_bytes( + cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents()) +{ + auto src_buffer = make_buffer_for_mdspan(src_extents, 1); + auto dst_buffer = make_buffer_for_mdspan(dst_extents, 0); + + cuda::std::mdspan src(src_buffer.data(), src_extents); + cuda::std::mdspan dst(dst_buffer.data(), dst_extents); + + for (int i = 0; i < static_cast(src.extent(1)); i++) + { + src(0, i) = i; + } + + cudax::copy_bytes(stream, std::move(src), dst); + stream.wait(); + + for (int i = 0; i < static_cast(dst.extent(1)); i++) + { + CUDAX_CHECK(dst(0, i) == i); + } +} + +TEST_CASE("Mdspan copy", "[data_manipulation]") +{ + cudax::stream stream; + + SECTION("Different extents") + { + auto static_extents = cuda::std::extents(); + test_mdspan_copy_bytes(stream, 
static_extents, static_extents); + test_mdspan_copy_bytes(stream, static_extents, static_extents); + + auto dynamic_extents = cuda::std::dextents(3, 4); + test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents); + test_mdspan_copy_bytes(stream, static_extents, dynamic_extents); + test_mdspan_copy_bytes(stream, static_extents, dynamic_extents); + + auto mixed_extents = cuda::std::extents(3); + test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents); + test_mdspan_copy_bytes(stream, mixed_extents, static_extents); + test_mdspan_copy_bytes(stream, mixed_extents, static_extents); + } + + SECTION("Launch transform") + { + auto mixed_extents = + cuda::std::extents(1024, 2); + [[maybe_unused]] auto static_extents = cuda::std::extents(); + auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1); + cuda::std::mdspan mdspan(mdspan_buffer.data(), mixed_extents); + cudax::weird_buffer> buffer{ + cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()}; + + cudax::copy_bytes(stream, mdspan, buffer); + stream.wait(); + CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size())); + } +} diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu index 7111aa848f3..ce733871f51 100644 --- a/cudax/test/algorithm/fill.cu +++ b/cudax/test/algorithm/fill.cu @@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]") check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size)); } } + +TEST_CASE("Mdspan Fill", "[data_manipulation]") +{ + cudax::stream stream; + { + cuda::std::dextents dynamic_extents{1, 2, 3}; + auto buffer = make_buffer_for_mdspan(dynamic_extents, 0); + cuda::std::mdspan dynamic_mdspan(buffer.data(), dynamic_extents); + + cudax::fill_bytes(stream, dynamic_mdspan, fill_byte); + check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size())); + } + { + cuda::std::extents mixed_extents{1}; + auto buffer = make_buffer_for_mdspan(mixed_extents, 0); + cuda::std::mdspan mixed_mdspan(buffer.data(), mixed_extents); + + cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte); + check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size())); + } + { + using static_extents = cuda::std::extents; + auto size = cuda::std::layout_left::mapping().required_span_size(); + cudax::weird_buffer> buffer(cuda::mr::pinned_memory_resource{}, size); + + cudax::fill_bytes(stream, buffer, fill_byte); + check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size)); + } +} From a085ba11095d0849a1ff62fb2f375d2601904868 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Mon, 25 Nov 2024 13:17:29 -0800 Subject: [PATCH 25/45] add a `_CCCL_NO_CONCEPTS` config macro (#2945) Co-authored-by: Bernhard Manfred Gruber --- libcudacxx/include/cuda/std/__cccl/dialect.h | 6 ++++ .../include/cuda/std/__concepts/arithmetic.h | 4 +-- .../include/cuda/std/__concepts/assignable.h | 6 ++-- .../cuda/std/__concepts/boolean_testable.h | 6 ++-- .../cuda/std/__concepts/class_or_enum.h | 4 +-- .../std/__concepts/common_reference_with.h | 6 ++-- .../include/cuda/std/__concepts/common_with.h | 6 ++-- .../cuda/std/__concepts/concept_macros.h | 23 +++++++------- .../cuda/std/__concepts/constructible.h | 6 ++-- .../cuda/std/__concepts/convertible_to.h | 6 ++-- .../include/cuda/std/__concepts/copyable.h | 6 ++-- .../cuda/std/__concepts/derived_from.h | 6 ++-- .../cuda/std/__concepts/destructible.h | 4 +-- .../cuda/std/__concepts/different_from.h | 4 +-- .../cuda/std/__concepts/equality_comparable.h | 6 ++-- 
.../include/cuda/std/__concepts/invocable.h | 6 ++-- .../include/cuda/std/__concepts/movable.h | 6 ++-- .../include/cuda/std/__concepts/predicate.h | 6 ++-- .../include/cuda/std/__concepts/regular.h | 6 ++-- .../include/cuda/std/__concepts/relation.h | 6 ++-- .../include/cuda/std/__concepts/same_as.h | 4 +-- .../include/cuda/std/__concepts/semiregular.h | 6 ++-- .../include/cuda/std/__concepts/swappable.h | 26 ++++++++-------- .../cuda/std/__concepts/totally_ordered.h | 6 ++-- .../include/cuda/std/__iterator/concepts.h | 4 +-- .../std/__iterator/incrementable_traits.h | 4 +-- .../include/cuda/std/__iterator/iter_move.h | 12 ++++---- .../include/cuda/std/__iterator/iter_swap.h | 6 ++-- .../cuda/std/__iterator/iterator_traits.h | 22 +++++++------- .../cuda/std/__iterator/move_iterator.h | 18 +++++------ .../cuda/std/__iterator/readable_traits.h | 4 +-- .../cuda/std/__iterator/reverse_iterator.h | 30 +++++++++---------- libcudacxx/include/cuda/std/__ranges/access.h | 12 ++++---- .../include/cuda/std/__ranges/concepts.h | 13 ++++---- libcudacxx/include/cuda/std/__ranges/data.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/empty.h | 6 ++-- .../include/cuda/std/__ranges/enable_view.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/rbegin.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/rend.h | 6 ++-- libcudacxx/include/cuda/std/__ranges/size.h | 10 +++---- .../include/cuda/std/__ranges/subrange.h | 18 +++++------ .../cuda/std/__ranges/view_interface.h | 12 ++++---- 42 files changed, 187 insertions(+), 179 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 8dfedd5a3cc..407f2db6ecf 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -80,6 +80,7 @@ # define _CCCL_IF_CONSTEXPR if constexpr # define _CCCL_ELSE_IF_CONSTEXPR else if constexpr #else // ^^^ C++17 ^^^ / vvv C++14 vvv +# define _CCCL_NO_IF_CONSTEXPR # define _CCCL_IF_CONSTEXPR if # define _CCCL_ELSE_IF_CONSTEXPR else if #endif // _CCCL_STD_VER <= 2014 @@ -104,6 +105,11 @@ # define _CCCL_NO_VARIABLE_TEMPLATES #endif // _CCCL_STD_VER <= 2011 +// concepts are only available from C++20 onwards +#if _CCCL_STD_VER <= 2017 || !defined(__cpp_concepts) || (__cpp_concepts < 201907L) +# define _CCCL_NO_CONCEPTS +#endif // _CCCL_STD_VER <= 2017 || !defined(__cpp_concepts) || (__cpp_concepts < 201907L) + // noexcept function types are only available from C++17 onwards #if _CCCL_STD_VER >= 2017 && defined(__cpp_noexcept_function_type) && (__cpp_noexcept_function_type >= 201510L) # define _CCCL_FUNCTION_TYPE_NOEXCEPT noexcept diff --git a/libcudacxx/include/cuda/std/__concepts/arithmetic.h b/libcudacxx/include/cuda/std/__concepts/arithmetic.h index 4f653cd35fc..5a643652824 100644 --- a/libcudacxx/include/cuda/std/__concepts/arithmetic.h +++ b/libcudacxx/include/cuda/std/__concepts/arithmetic.h @@ -30,7 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) // [concepts.arithmetic], arithmetic concepts @@ -49,7 +49,7 @@ _CCCL_CONCEPT floating_point = _CCCL_TRAIT(is_floating_point, _Tp); template _CCCL_CONCEPT __libcpp_signed_integer = __libcpp_is_signed_integer<_Tp>::value; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/assignable.h b/libcudacxx/include/cuda/std/__concepts/assignable.h index d3b0c89e96d..d2d3c96d64d 100644 --- 
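The per-header hunks that follow (the assignable.h one resumes directly below) all apply the same mechanical substitution of the language-version check with the new macro, so a minimal sketch of the resulting pattern may help; __fits_in_pointer is a hypothetical concept used only for illustration.

// Hedged sketch of the dual-path pattern gated by _CCCL_NO_CONCEPTS (hypothetical concept).
#include <cuda/std/__cccl/dialect.h> // internal header shown in this patch; normally reached indirectly

#if !defined(_CCCL_NO_CONCEPTS)
template <class _Tp>
concept __fits_in_pointer = sizeof(_Tp) <= sizeof(void*);
#else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv
template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __fits_in_pointer = sizeof(_Tp) <= sizeof(void*);
#endif // _CCCL_NO_CONCEPTS

Checking the dedicated macro instead of `_CCCL_STD_VER > 2017` keeps the decision in one place (dialect.h) and also covers compilers whose `__cpp_concepts` value lags behind the selected standard.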
a/libcudacxx/include/cuda/std/__concepts/assignable.h +++ b/libcudacxx/include/cuda/std/__concepts/assignable.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.assignable] @@ -40,7 +40,7 @@ concept assignable_from = { __lhs = _CUDA_VSTD::forward<_Rhs>(__rhs) } -> same_as<_Lhs>; }; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -53,7 +53,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT assignable_from = _CCCL_FRAGMENT(__assignable_from_, _Lhs, _Rhs); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/boolean_testable.h b/libcudacxx/include/cuda/std/__concepts/boolean_testable.h index c3717385ebd..adc07b35842 100644 --- a/libcudacxx/include/cuda/std/__concepts/boolean_testable.h +++ b/libcudacxx/include/cuda/std/__concepts/boolean_testable.h @@ -26,7 +26,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concepts.booleantestable] @@ -38,7 +38,7 @@ concept __boolean_testable = __boolean_testable_impl<_Tp> && requires(_Tp&& __t) { !_CUDA_VSTD::forward<_Tp>(__t) } -> __boolean_testable_impl; }; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT __boolean_testable_impl = convertible_to<_Tp, bool>; @@ -52,7 +52,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __boolean_testable = _CCCL_FRAGMENT(__boolean_testable_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/class_or_enum.h b/libcudacxx/include/cuda/std/__concepts/class_or_enum.h index 390ec8c5991..f94dec899f2 100644 --- a/libcudacxx/include/cuda/std/__concepts/class_or_enum.h +++ b/libcudacxx/include/cuda/std/__concepts/class_or_enum.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template _CCCL_CONCEPT __class_or_enum = _CCCL_TRAIT(is_class, _Tp) || _CCCL_TRAIT(is_union, _Tp) || _CCCL_TRAIT(is_enum, _Tp); @@ -39,7 +39,7 @@ template _CCCL_CONCEPT __workaround_52970 = _CCCL_TRAIT(is_class, remove_cvref_t<_Tp>) || _CCCL_TRAIT(is_union, remove_cvref_t<_Tp>); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/common_reference_with.h b/libcudacxx/include/cuda/std/__concepts/common_reference_with.h index a41f04a1563..648805ca871 100644 --- a/libcudacxx/include/cuda/std/__concepts/common_reference_with.h +++ b/libcudacxx/include/cuda/std/__concepts/common_reference_with.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.commonref] @@ -38,7 +38,7 @@ concept common_reference_with = same_as, common_reference_t<_Up, _Tp>> && convertible_to<_Tp, common_reference_t<_Tp, _Up>> && convertible_to<_Up, common_reference_t<_Tp, _Up>>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__common_reference_exists_, @@ -58,7 +58,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT common_reference_with = _CCCL_FRAGMENT(__common_reference_with_, _Tp, _Up); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ 
!_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/common_with.h b/libcudacxx/include/cuda/std/__concepts/common_with.h index 683ce44f5e4..20bb3680755 100644 --- a/libcudacxx/include/cuda/std/__concepts/common_with.h +++ b/libcudacxx/include/cuda/std/__concepts/common_with.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.common] @@ -39,7 +39,7 @@ concept common_with = same_as, common_type_t<_Up, _Tp>> static_cast>(_CUDA_VSTD::declval<_Up>()); } && common_reference_with, add_lvalue_reference_t> && common_reference_with>, common_reference_t, add_lvalue_reference_t>>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__common_type_exists_, @@ -71,7 +71,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT common_with = _CCCL_FRAGMENT(__common_with_, _Tp, _Up); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/concept_macros.h b/libcudacxx/include/cuda/std/__concepts/concept_macros.h index 18587ca57df..8fc98cde0ff 100644 --- a/libcudacxx/include/cuda/std/__concepts/concept_macros.h +++ b/libcudacxx/include/cuda/std/__concepts/concept_macros.h @@ -52,21 +52,22 @@ using __cccl_enable_if_t = typename __cccl_select<_Bp>::template type<_Tp>; template using __cccl_requires_t = typename __cccl_select<_Bp>::template type<_Tp>; -#if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) || defined(_CCCL_DOXYGEN_INVOKED) +#if !defined(_CCCL_NO_CONCEPTS) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_TEMPLATE(...) template <__VA_ARGS__> # define _CCCL_REQUIRES(...) requires __VA_ARGS__ # define _CCCL_AND && # define _CCCL_TRAILING_REQUIRES_AUX_(...) requires __VA_ARGS__ # define _CCCL_TRAILING_REQUIRES(...) ->__VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ -#else // ^^^ __cpp_concepts ^^^ / vvv !__cpp_concepts vvv +#else // ^^^ _CCCL_NO_CONCEPTS ^^^ / vvv !_CCCL_NO_CONCEPTS vvv # define _CCCL_TEMPLATE(...) template <__VA_ARGS__ # define _CCCL_REQUIRES(...) , bool __cccl_true_ = true, __cccl_enable_if_t < __VA_ARGS__ && __cccl_true_, int > = 0 > # define _CCCL_AND &&__cccl_true_, int > = 0, __cccl_enable_if_t < # define _CCCL_TRAILING_REQUIRES_AUX_(...) , __VA_ARGS__ > # define _CCCL_TRAILING_REQUIRES(...) ->__cccl_requires_t < __VA_ARGS__ _CCCL_TRAILING_REQUIRES_AUX_ -#endif // !__cpp_concepts +#endif // !defined(_CCCL_NO_CONCEPTS) -#if _CCCL_STD_VER >= 2014 +// The following concepts emulation macros need variable template support +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template struct __cccl_tag; @@ -141,7 +142,7 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- # define _CCCL_PP_EAT_TYPENAME_SELECT_1(...) _CCCL_PP_CAT3(_CCCL_PP_EAT_TYPENAME_, __VA_ARGS__) # define _CCCL_PP_EAT_TYPENAME_typename -# if (defined(__cpp_concepts) && _CCCL_STD_VER >= 2020) || defined(_CCCL_DOXYGEN_INVOKED) +# if !defined(_CCCL_NO_CONCEPTS) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_CONCEPT concept @@ -167,7 +168,7 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- # define _CCCL_FRAGMENT(_NAME, ...) 
_NAME<__VA_ARGS__> -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv # define _CCCL_CONCEPT _CCCL_INLINE_VAR constexpr bool @@ -207,7 +208,7 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- # define _CCCL_FRAGMENT(_NAME, ...) \ (1u == sizeof(_NAME##_CCCL_CONCEPT_FRAGMENT_(static_cast<::__cccl_tag<__VA_ARGS__>*>(nullptr), nullptr))) -# endif +# endif // ^^^ _CCCL_NO_CONCEPTS ^^^ //////////////////////////////////////////////////////////////////////////////// // _CCCL_REQUIRES_EXPR @@ -220,10 +221,10 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- // ); // // Can only be used as the last requirement in a concept definition. -# if defined(__cpp_concepts) && _CCCL_STD_VER >= 2020 || defined(_CCCL_DOXYGEN_INVOKED) +# if !defined(_CCCL_NO_CONCEPTS) || defined(_CCCL_DOXYGEN_INVOKED) # define _CCCL_REQUIRES_EXPR(_TY, ...) requires(__VA_ARGS__) _CCCL_REQUIRES_EXPR_2 # define _CCCL_REQUIRES_EXPR_2(...) {_CCCL_PP_FOR_EACH(_CCCL_CONCEPT_FRAGMENT_REQS_M, __VA_ARGS__)} -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv # define _CCCL_REQUIRES_EXPR_TPARAM_PROBE_variadic _CCCL_PP_PROBE(~) # define _CCCL_REQUIRES_EXPR_TPARAM_variadic @@ -268,8 +269,8 @@ namespace __cccl_unqualified_cuda_std = _CUDA_VSTD; // NOLINT(misc-unused-alias- return false; \ } \ } -# endif +# endif // ^^^ _CCCL_NO_CONCEPTS ^^^ -#endif // _CCCL_STD_VER >= 2014 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES ^^^ #endif //_CUDA___CONCEPTS diff --git a/libcudacxx/include/cuda/std/__concepts/constructible.h b/libcudacxx/include/cuda/std/__concepts/constructible.h index 13879811f8b..08c579060fe 100644 --- a/libcudacxx/include/cuda/std/__concepts/constructible.h +++ b/libcudacxx/include/cuda/std/__concepts/constructible.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.constructible] template @@ -52,7 +52,7 @@ concept copy_constructible = && constructible_from<_Tp, const _Tp&> && convertible_to && constructible_from<_Tp, const _Tp> && convertible_to; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__constructible_from_, @@ -96,7 +96,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT copy_constructible = _CCCL_FRAGMENT(__copy_constructible_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/convertible_to.h b/libcudacxx/include/cuda/std/__concepts/convertible_to.h index 169383cb095..45eebf9d97d 100644 --- a/libcudacxx/include/cuda/std/__concepts/convertible_to.h +++ b/libcudacxx/include/cuda/std/__concepts/convertible_to.h @@ -28,12 +28,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // [concept.convertible] -#if _CCCL_STD_VER >= 2020 +#if !defined(_CCCL_NO_CONCEPTS) template concept convertible_to = is_convertible_v<_From, _To> && requires { static_cast<_To>(_CUDA_VSTD::declval<_From>()); }; -#elif _CCCL_STD_VER >= 2014 // ^^^ C++20 ^^^ / vvv C++14/17 vvv +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ # if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_SUPPRESS(1211) // nonstandard cast to array type ignored @@ -60,7 +60,7 @@ _CCCL_NV_DIAG_DEFAULT(1211) // nonstandard cast to array type ignored # endif // _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(171) // invalid type conversion, e.g. 
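(The convertible_to.h hunk resumes right below.) Since the emulation macros above are what most of the remaining hunks rely on, a small constrained-function sketch may help; the function, the constraint, and the direct include of the internal header are made up for illustration only.

// Hedged sketch: one signature that compiles with real concepts and with the emulation path.
#include <cuda/std/__concepts/concept_macros.h> // internal header; normally reached via other cuda/std headers
#include <cuda/std/type_traits>

_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(cuda::std::is_integral<_Tp>::value _CCCL_AND(sizeof(_Tp) >= 4))
__host__ __device__ _Tp twice(_Tp __v)
{
  return static_cast<_Tp>(__v + __v); // participates in overload resolution only for 4+ byte integer types
}

In the emulation path `_CCCL_REQUIRES` closes the template parameter list itself, which is why `_CCCL_TEMPLATE` deliberately leaves it open; the two macros must always be used as a pair.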
[with _From=int **, _To=const int *const *] -#endif // _CCCL_STD_VER >= 2014 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES ^^^ _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/copyable.h b/libcudacxx/include/cuda/std/__concepts/copyable.h index 11bf23329bc..1ba79c71ed2 100644 --- a/libcudacxx/include/cuda/std/__concepts/copyable.h +++ b/libcudacxx/include/cuda/std/__concepts/copyable.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concepts.object] @@ -35,7 +35,7 @@ template concept copyable = copy_constructible<_Tp> && movable<_Tp> && assignable_from<_Tp&, _Tp&> && assignable_from<_Tp&, const _Tp&> && assignable_from<_Tp&, const _Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -49,7 +49,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT copyable = _CCCL_FRAGMENT(__copyable_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/derived_from.h b/libcudacxx/include/cuda/std/__concepts/derived_from.h index ff3f0cb2411..dca99425d54 100644 --- a/libcudacxx/include/cuda/std/__concepts/derived_from.h +++ b/libcudacxx/include/cuda/std/__concepts/derived_from.h @@ -27,14 +27,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.derived] template concept derived_from = is_base_of_v<_Bp, _Dp> && is_convertible_v; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -46,7 +46,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT derived_from = _CCCL_FRAGMENT(__derived_from_, _Dp, _Bp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/destructible.h b/libcudacxx/include/cuda/std/__concepts/destructible.h index 90426478490..62d241b9e33 100644 --- a/libcudacxx/include/cuda/std/__concepts/destructible.h +++ b/libcudacxx/include/cuda/std/__concepts/destructible.h @@ -30,7 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) # if _CCCL_COMPILER(MSVC) @@ -69,7 +69,7 @@ _CCCL_CONCEPT destructible = __destructible<_Tp>; # endif // !_CCCL_COMPILER(MSVC) -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/different_from.h b/libcudacxx/include/cuda/std/__concepts/different_from.h index 596fa0c2587..0675c0171b0 100644 --- a/libcudacxx/include/cuda/std/__concepts/different_from.h +++ b/libcudacxx/include/cuda/std/__concepts/different_from.h @@ -26,12 +26,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) template _CCCL_CONCEPT __different_from = !same_as, remove_cvref_t<_Up>>; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/equality_comparable.h b/libcudacxx/include/cuda/std/__concepts/equality_comparable.h index c2909df1a3b..ed599a7f2cb 100644 --- a/libcudacxx/include/cuda/std/__concepts/equality_comparable.h +++ b/libcudacxx/include/cuda/std/__concepts/equality_comparable.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 
+#if !defined(_CCCL_NO_CONCEPTS) // [concept.equalitycomparable] @@ -51,7 +51,7 @@ concept equality_comparable_with = && equality_comparable, __make_const_lvalue_ref<_Up>>> && __weakly_equality_comparable_with<_Tp, _Up>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(__with_lvalue_reference_, requires()(typename(__make_const_lvalue_ref<_Tp>))); @@ -89,7 +89,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT equality_comparable_with = _CCCL_FRAGMENT(__equality_comparable_with_, _Tp, _Up); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/invocable.h b/libcudacxx/include/cuda/std/__concepts/invocable.h index c9dda78270e..864821362e7 100644 --- a/libcudacxx/include/cuda/std/__concepts/invocable.h +++ b/libcudacxx/include/cuda/std/__concepts/invocable.h @@ -27,7 +27,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.invocable] @@ -48,7 +48,7 @@ concept __invoke_constructible = requires(_Fun&& __fun, _Args&&... __args) { _CUDA_VSTD::invoke(_CUDA_VSTD::forward<_Fun>(__fun), _CUDA_VSTD::forward<_Args>(__args)...)); }; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT(_Invocable_, @@ -69,7 +69,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __invoke_constructible = _CCCL_FRAGMENT(__invoke_constructible_, _Fun, _Args...); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/movable.h b/libcudacxx/include/cuda/std/__concepts/movable.h index 18f47cba6c5..98641e22319 100644 --- a/libcudacxx/include/cuda/std/__concepts/movable.h +++ b/libcudacxx/include/cuda/std/__concepts/movable.h @@ -28,12 +28,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template concept movable = is_object_v<_Tp> && move_constructible<_Tp> && assignable_from<_Tp&, _Tp> && swappable<_Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concepts.object] template @@ -47,7 +47,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT movable = _CCCL_FRAGMENT(_Movable_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/predicate.h b/libcudacxx/include/cuda/std/__concepts/predicate.h index 7d8ee168583..8538468063c 100644 --- a/libcudacxx/include/cuda/std/__concepts/predicate.h +++ b/libcudacxx/include/cuda/std/__concepts/predicate.h @@ -27,12 +27,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template concept predicate = regular_invocable<_Fn, _Args...> && __boolean_testable>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concept.predicate] template @@ -43,7 +43,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT predicate = _CCCL_FRAGMENT(_Predicate_, _Fn, _Args...); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/regular.h b/libcudacxx/include/cuda/std/__concepts/regular.h index 506dc7700f7..757976cf338 100644 --- a/libcudacxx/include/cuda/std/__concepts/regular.h 
+++ b/libcudacxx/include/cuda/std/__concepts/regular.h @@ -26,14 +26,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.object] template concept regular = semiregular<_Tp> && equality_comparable<_Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concept.object] @@ -43,7 +43,7 @@ _CCCL_CONCEPT_FRAGMENT(__regular_, requires()(requires(semiregular<_Tp>), requir template _CCCL_CONCEPT regular = _CCCL_FRAGMENT(__regular_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/relation.h b/libcudacxx/include/cuda/std/__concepts/relation.h index e6006db9a8a..9d552c195bb 100644 --- a/libcudacxx/include/cuda/std/__concepts/relation.h +++ b/libcudacxx/include/cuda/std/__concepts/relation.h @@ -25,7 +25,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.relation] @@ -43,7 +43,7 @@ concept equivalence_relation = relation<_Rp, _Tp, _Up>; template concept strict_weak_order = relation<_Rp, _Tp, _Up>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -66,7 +66,7 @@ _CCCL_CONCEPT equivalence_relation = relation<_Rp, _Tp, _Up>; template _CCCL_CONCEPT strict_weak_order = relation<_Rp, _Tp, _Up>; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/same_as.h b/libcudacxx/include/cuda/std/__concepts/same_as.h index 59b59d6afb7..6247b74d5ec 100644 --- a/libcudacxx/include/cuda/std/__concepts/same_as.h +++ b/libcudacxx/include/cuda/std/__concepts/same_as.h @@ -25,7 +25,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2011 +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) // [concept.same] @@ -35,7 +35,7 @@ _CCCL_CONCEPT __same_as_impl = _IsSame<_Tp, _Up>::value; template _CCCL_CONCEPT same_as = __same_as_impl<_Tp, _Up> && __same_as_impl<_Up, _Tp>; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/semiregular.h b/libcudacxx/include/cuda/std/__concepts/semiregular.h index ae3876885e3..e3c5dd482a6 100644 --- a/libcudacxx/include/cuda/std/__concepts/semiregular.h +++ b/libcudacxx/include/cuda/std/__concepts/semiregular.h @@ -26,14 +26,14 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.object] template concept semiregular = copyable<_Tp> && default_initializable<_Tp>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [concept.object] @@ -43,7 +43,7 @@ _CCCL_CONCEPT_FRAGMENT(__semiregular_, requires()(requires(copyable<_Tp>), requi template _CCCL_CONCEPT semiregular = _CCCL_FRAGMENT(__semiregular_, _Tp); -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__concepts/swappable.h b/libcudacxx/include/cuda/std/__concepts/swappable.h index 8688e71a702..2ad1e4270a3 100644 --- a/libcudacxx/include/cuda/std/__concepts/swappable.h +++ b/libcudacxx/include/cuda/std/__concepts/swappable.h @@ -41,7 +41,7 @@ _CCCL_NV_DIAG_SUPPRESS(461) // nonstandard cast to array type ignored #endif // _CCCL_COMPILER(MSVC) -#if _CCCL_STD_VER > 2011 +#if 
!defined(_CCCL_NO_VARIABLE_TEMPLATES) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES @@ -52,7 +52,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__swap) template void swap(_Tp&, _Tp&) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __unqualified_swappable_with = (__class_or_enum> || __class_or_enum>) @@ -62,7 +62,7 @@ template concept __exchangeable = !__unqualified_swappable_with<_Tp&, _Tp&> && move_constructible<_Tp> && assignable_from<_Tp&, _Tp>; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( @@ -80,9 +80,9 @@ _CCCL_CONCEPT_FRAGMENT(__exchangeable_, template _CCCL_CONCEPT __exchangeable = _CCCL_FRAGMENT(__exchangeable_, _Tp); -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS -# if _CCCL_STD_VER > 2017 && !_CCCL_COMPILER(NVHPC) // nvbug4051640 +# if !defined(_CCCL_NO_CONCEPTS) && !_CCCL_COMPILER(NVHPC) // nvbug4051640 struct __fn; _CCCL_NV_DIAG_SUPPRESS(2642) @@ -92,10 +92,10 @@ concept __swappable_arrays = && requires(_Tp (&__t)[_Size], _Up (&__u)[_Size], const __fn& __swap) { __swap(__t[0], __u[0]); }; _CCCL_NV_DIAG_DEFAULT(2642) -# else +# else // ^^^ !_CCCL_NO_CONCEPTS && !_CCCL_COMPILER(NVHPC) ^^^ / vvv _CCCL_NO_CONCEPTS || _CCCL_COMPILER(NVHPC) vvv template _CCCL_INLINE_VAR constexpr bool __swappable_arrays = false; -# endif // _CCCL_STD_VER < 2020 || _CCCL_COMPILER(NVHPC) +# endif // _CCCL_NO_CONCEPTS || _CCCL_COMPILER(NVHPC) template _CCCL_INLINE_VAR constexpr bool __noexcept_swappable_arrays = false; @@ -135,7 +135,7 @@ struct __fn } }; -# if _CCCL_STD_VER < 2020 || _CCCL_COMPILER(NVHPC) +# if defined(_CCCL_NO_CONCEPTS) || _CCCL_COMPILER(NVHPC) template _CCCL_CONCEPT_FRAGMENT( __swappable_arrays_, @@ -147,7 +147,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_INLINE_VAR constexpr bool __swappable_arrays<_Tp, _Up, _Size, void_t>> = _CCCL_FRAGMENT(__swappable_arrays_, _Tp, _Up, _CUDA_VSTD::integral_constant); -# endif // _CCCL_STD_VER < 2020 || _CCCL_COMPILER(NVHPC) +# endif // _CCCL_NO_CONCEPTS || _CCCL_COMPILER(NVHPC) template _CCCL_INLINE_VAR constexpr bool __noexcept_swappable_arrays<_Tp, _Up, void_t>> = @@ -163,7 +163,7 @@ _LIBCUDACXX_END_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_STD -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept swappable = requires(_Tp& __a, _Tp& __b) { _CUDA_VRANGES::swap(__a, __b); }; @@ -174,7 +174,7 @@ concept swappable_with = common_reference_with<_Tp, _Up> && requires(_Tp&& __t, _CUDA_VRANGES::swap(_CUDA_VSTD::forward<_Tp>(__t), _CUDA_VSTD::forward<_Up>(__u)); _CUDA_VRANGES::swap(_CUDA_VSTD::forward<_Up>(__u), _CUDA_VSTD::forward<_Tp>(__t)); }; -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__swappable_, requires(_Tp& __a, _Tp& __b)((_CUDA_VRANGES::swap(__a, __b)))); @@ -193,11 +193,11 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT swappable_with = _CCCL_FRAGMENT(__swappable_with_, _Tp, _Up); -# endif +# endif // ^^^ _CCCL_NO_CONCEPTS ^^^ _LIBCUDACXX_END_NAMESPACE_STD -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES #if _CCCL_COMPILER(MSVC) _CCCL_NV_DIAG_DEFAULT(461) // nonstandard cast to array type ignored diff --git a/libcudacxx/include/cuda/std/__concepts/totally_ordered.h b/libcudacxx/include/cuda/std/__concepts/totally_ordered.h index 59e9254289a..088098956c0 100644 --- a/libcudacxx/include/cuda/std/__concepts/totally_ordered.h +++ b/libcudacxx/include/cuda/std/__concepts/totally_ordered.h @@ -28,7 +28,7 @@ 
_LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [concept.totallyordered] @@ -53,7 +53,7 @@ concept totally_ordered_with = && totally_ordered, __make_const_lvalue_ref<_Up>>> && __partially_ordered_with<_Tp, _Up>; -#elif _CCCL_STD_VER > 2011 +#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES) // ^^^ !_CCCL_NO_CONCEPTS ^^^ template _CCCL_CONCEPT_FRAGMENT( @@ -91,7 +91,7 @@ template _CCCL_CONCEPT totally_ordered_with = _CCCL_FRAGMENT(__totally_ordered_with_, _Tp, _Up); ; -#endif // _CCCL_STD_VER > 2011 +#endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__iterator/concepts.h b/libcudacxx/include/cuda/std/__iterator/concepts.h index e4e507afe83..ef36ad11f9d 100644 --- a/libcudacxx/include/cuda/std/__iterator/concepts.h +++ b/libcudacxx/include/cuda/std/__iterator/concepts.h @@ -53,7 +53,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [iterator.concept.readable] template @@ -254,7 +254,7 @@ concept indirectly_copyable_storable = // Note: indirectly_swappable is located in iter_swap.h to prevent a dependency cycle // (both iter_swap and indirectly_swappable require indirectly_readable). -#elif _CCCL_STD_VER > 2014 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [iterator.concept.readable] template diff --git a/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h b/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h index e9d462eeaf4..4555b4ae412 100644 --- a/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/incrementable_traits.h @@ -37,7 +37,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [incrementable.traits] template @@ -88,7 +88,7 @@ using iter_difference_t = incrementable_traits>, iterator_traits>>::difference_type; -#elif _CCCL_STD_VER > 2014 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [incrementable.traits] template diff --git a/libcudacxx/include/cuda/std/__iterator/iter_move.h b/libcudacxx/include/cuda/std/__iterator/iter_move.h index 1dfb489933b..54ce7692c1e 100644 --- a/libcudacxx/include/cuda/std/__iterator/iter_move.h +++ b/libcudacxx/include/cuda/std/__iterator/iter_move.h @@ -42,7 +42,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__iter_move) _CCCL_HOST_DEVICE void iter_move(); -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __unqualified_iter_move = __class_or_enum> && requires(_Tp&& __t) { iter_move(_CUDA_VSTD::forward<_Tp>(__t)); }; @@ -59,7 +59,7 @@ concept __just_deref = !__unqualified_iter_move<_Tp> && !__move_deref<_Tp> && re requires(!is_lvalue_reference_v); }; -# else // ^^^ _CCCL_STD_VER >= 2020 ^^^ / vvv _CCCL_STD_VER <= 2017 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( @@ -85,7 +85,7 @@ _CCCL_CONCEPT_FRAGMENT(__just_deref_, template _CCCL_CONCEPT __just_deref = _CCCL_FRAGMENT(__just_deref_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS // [iterator.cust.move] @@ -124,14 +124,14 @@ _LIBCUDACXX_END_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_STD -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template <__dereferenceable _Tp> requires requires(_Tp& __t) { { _CUDA_VRANGES::iter_move(__t) } -> __can_reference; } using iter_rvalue_reference_t = decltype(_CUDA_VRANGES::iter_move(_CUDA_VSTD::declval<_Tp&>())); -# else // ^^^ _CCCL_STD_VER >= 2020 ^^^ / vvv 
_CCCL_STD_VER <= 2017 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__can_iter_rvalue_reference_t_, @@ -146,7 +146,7 @@ using __iter_rvalue_reference_t = decltype(_CUDA_VRANGES::iter_move(_CUDA_VSTD:: template using iter_rvalue_reference_t = enable_if_t<__can_iter_rvalue_reference_t<_Tp>, __iter_rvalue_reference_t<_Tp>>; -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__iterator/iter_swap.h b/libcudacxx/include/cuda/std/__iterator/iter_swap.h index a4047cbba10..bafeed69742 100644 --- a/libcudacxx/include/cuda/std/__iterator/iter_swap.h +++ b/libcudacxx/include/cuda/std/__iterator/iter_swap.h @@ -39,7 +39,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__iter_swap) template void iter_swap(_I1, _I2) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __unqualified_iter_swap = (__class_or_enum> || __class_or_enum>) @@ -52,7 +52,7 @@ concept __readable_swappable = !__unqualified_iter_swap<_T1, _T2> && indirectly_ template concept __moveable_storable = !__unqualified_iter_swap<_T1, _T2> && !__readable_swappable<_T1, _T2> && indirectly_movable_storable<_T1, _T2> && indirectly_movable_storable<_T2, _T1>; -# else +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __unqualified_iter_swap_, @@ -83,7 +83,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __moveable_storable = _CCCL_FRAGMENT(__moveable_storable_, _T1, _T2); -# endif // _CCCL_STD_VER > 2011 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h index 759af45cc3a..020f27071db 100644 --- a/libcudacxx/include/cuda/std/__iterator/iterator_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/iterator_traits.h @@ -67,7 +67,7 @@ struct __cccl_std_contiguous_iterator_tag_exists : __cccl_type_is_defined= 2020 +#if !defined(_CCCL_NO_CONCEPTS) template using __with_reference = _Tp&; @@ -87,7 +87,7 @@ using iter_reference_t = decltype(*declval<_Tp&>()); template struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits; -#elif _CCCL_STD_VER >= 2017 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ template using __with_reference = _Tp&; @@ -113,7 +113,7 @@ using iter_reference_t = enable_if_t<__dereferenceable<_Tp>, decltype(*_CUDA_VST template struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits; -#else // ^^^ _CCCL_STD_VER >= 2017 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv +#else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv template struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits; #endif // _CCCL_STD_VER <= 2014 @@ -242,7 +242,7 @@ struct __has_iterator_concept static const bool value = decltype(__test<_Tp>(nullptr))::value; }; -#if _CCCL_STD_VER >= 2020 +#if !defined(_CCCL_NO_CONCEPTS) // The `cpp17-*-iterator` exposition-only concepts have very similar names to the `Cpp17*Iterator` named requirements // from `[iterator.cpp17]`. 
To avoid confusion between the two, the exposition-only concepts have been banished to @@ -484,7 +484,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits : __iterator_traits<_Ip> using __primary_template = iterator_traits; }; -#elif _CCCL_STD_VER >= 2017 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_STD_VER > 2014 vvv // The `cpp17-*-iterator` exposition-only concepts have very similar names to the `Cpp17*Iterator` named requirements // from `[iterator.cpp17]`. To avoid confusion between the two, the exposition-only concepts have been banished to @@ -764,7 +764,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits : __iterator_traits<_Ip> using __primary_template = iterator_traits; }; -#else // _CCCL_STD_VER >= 2014 +#else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv template struct __iterator_traits @@ -804,7 +804,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits : __iterator_traits<_Iter, #endif // _CCCL_STD_VER <= 2014 template -#if _CCCL_STD_VER >= 2020 +#if !defined(_CCCL_NO_CONCEPTS) requires is_object_v<_Tp> #endif struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits<_Tp*> @@ -814,7 +814,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT iterator_traits<_Tp*> typedef _Tp* pointer; typedef typename add_lvalue_reference<_Tp>::type reference; typedef random_access_iterator_tag iterator_category; -#if _CCCL_STD_VER >= 2017 +#if _CCCL_STD_VER > 2014 typedef contiguous_iterator_tag iterator_concept; #endif }; @@ -860,17 +860,17 @@ struct __is_cpp17_random_access_iterator // Such iterators receive special "contiguous" optimizations in // std::copy and std::sort. // -#if _CCCL_STD_VER >= 2017 +#if _CCCL_STD_VER > 2014 template struct __is_cpp17_contiguous_iterator : _Or<__has_iterator_category_convertible_to<_Tp, contiguous_iterator_tag>, __has_iterator_concept_convertible_to<_Tp, contiguous_iterator_tag>> {}; -#else +#else // ^^^ _CCCL_STD_VER > 2014 ^^^ / vvv _CCCL_STD_VER <= 2014 vvv template struct __is_cpp17_contiguous_iterator : false_type {}; -#endif +#endif // _CCCL_STD_VER <= 2014 // Any native pointer which is an iterator is also a contiguous iterator. 
template diff --git a/libcudacxx/include/cuda/std/__iterator/move_iterator.h b/libcudacxx/include/cuda/std/__iterator/move_iterator.h index efdf656366a..0436b25b36c 100644 --- a/libcudacxx/include/cuda/std/__iterator/move_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/move_iterator.h @@ -44,7 +44,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template struct __move_iter_category_base {}; @@ -67,7 +67,7 @@ concept __move_iter_comparable = requires { template _CCCL_INLINE_VAR constexpr bool __noexcept_move_iter_iter_move = noexcept(_CUDA_VRANGES::iter_move(_CUDA_VSTD::declval<_Iter>())); -#elif _CCCL_STD_VER >= 2017 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_STD_VER > 2014 vvv template struct __move_iter_category_base {}; @@ -92,7 +92,7 @@ _CCCL_CONCEPT __move_iter_comparable = _CCCL_FRAGMENT(__move_iter_comparable_, _ template _CCCL_INLINE_VAR constexpr bool __noexcept_move_iter_iter_move = noexcept(_CUDA_VRANGES::iter_move(_CUDA_VSTD::declval<_Iter>())); -#endif // _CCCL_STD_VER >= 2017 +#endif // _CCCL_STD_VER > 2014 template class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator @@ -179,18 +179,18 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT move_iterator } #if _CCCL_STD_VER > 2014 -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) _LIBCUDACXX_HIDE_FROM_ABI constexpr move_iterator() requires is_constructible_v<_Iter> : __current_() {} -# else // ^^^ _CCCL_STD_VER > 2017 ^^^ / vvv _CCCL_STD_VER < 2020 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv _CCCL_TEMPLATE(class _It2 = _Iter) _CCCL_REQUIRES(is_constructible_v<_It2>) _LIBCUDACXX_HIDE_FROM_ABI constexpr move_iterator() : __current_() {} -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS _CCCL_TEMPLATE(class _Up) _CCCL_REQUIRES((!_IsSame<_Up, _Iter>::value) && convertible_to) @@ -460,7 +460,7 @@ operator-(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) -> return __x.base() - __y.base(); } -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template _LIBCUDACXX_HIDE_FROM_ABI constexpr move_iterator<_Iter> operator+(iter_difference_t<_Iter> __n, const move_iterator<_Iter>& __x) @@ -470,14 +470,14 @@ operator+(iter_difference_t<_Iter> __n, const move_iterator<_Iter>& __x) { return __x + __n; } -#else // ^^^ _CCCL_STD_VER > 2017 ^^^ / vvv _CCCL_STD_VER < 2020 vvv +#else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 move_iterator<_Iter> operator+(typename move_iterator<_Iter>::difference_type __n, const move_iterator<_Iter>& __x) { return move_iterator<_Iter>(__x.base() + __n); } -#endif // _CCCL_STD_VER < 2020 +#endif // _CCCL_NO_CONCEPTS template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 move_iterator<_Iter> make_move_iterator(_Iter __i) diff --git a/libcudacxx/include/cuda/std/__iterator/readable_traits.h b/libcudacxx/include/cuda/std/__iterator/readable_traits.h index e7e5dcd3bf4..b73086dd968 100644 --- a/libcudacxx/include/cuda/std/__iterator/readable_traits.h +++ b/libcudacxx/include/cuda/std/__iterator/readable_traits.h @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) // [readable.traits] template @@ -106,7 +106,7 @@ using iter_value_t = indirectly_readable_traits>, iterator_traits>>::value_type; -#elif _CCCL_STD_VER > 2014 +#elif _CCCL_STD_VER > 2014 // ^^^ !_CCCL_NO_CONCEPTS ^^^ // [readable.traits] template diff --git 
a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h index 982312731f9..502f090afff 100644 --- a/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h +++ b/libcudacxx/include/cuda/std/__iterator/reverse_iterator.h @@ -175,7 +175,7 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT reverse_iterator return *--__tmp; } -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) && !defined(_CCCL_NO_IF_CONSTEXPR) _LIBCUDACXX_HIDE_FROM_ABI constexpr pointer operator->() const requires is_pointer_v<_Iter> || requires(const _Iter __i) { __i.operator->(); } { @@ -285,11 +285,11 @@ struct __is_reverse_iterator> : true_type template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator==(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() == __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() == __y.base(); } @@ -297,11 +297,11 @@ operator==(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() > __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() > __y.base(); } @@ -309,11 +309,11 @@ operator<(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& _ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator!=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() != __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() != __y.base(); } @@ -321,11 +321,11 @@ operator!=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() < __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() < __y.base(); } @@ -333,11 +333,11 @@ operator>(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& _ template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator>=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() <= __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() <= __y.base(); } @@ -345,11 +345,11 @@ operator>=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool operator<=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) requires requires { { __x.base() >= __y.base() } -> convertible_to; } -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS { return __x.base() >= __y.base(); } @@ -377,11 +377,11 @@ operator+(typename reverse_iterator<_Iter>::difference_type __n, const reverse_i return 
reverse_iterator<_Iter>(__x.base() - __n); } -#if _CCCL_STD_VER > 2017 +#if !defined(_CCCL_NO_CONCEPTS) template requires(!sized_sentinel_for<_Iter1, _Iter2>) inline constexpr bool disable_sized_sentinel_for, reverse_iterator<_Iter2>> = true; -#endif // _CCCL_STD_VER > 2017 +#endif // !_CCCL_NO_CONCEPTS #if _CCCL_STD_VER > 2011 template diff --git a/libcudacxx/include/cuda/std/__ranges/access.h b/libcudacxx/include/cuda/std/__ranges/access.h index 2c1525e1ad4..3c5ef7da52b 100644 --- a/libcudacxx/include/cuda/std/__ranges/access.h +++ b/libcudacxx/include/cuda/std/__ranges/access.h @@ -46,7 +46,7 @@ void begin(_Tp&) = delete; template void begin(const _Tp&) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_begin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.begin()) } -> input_or_output_iterator; @@ -57,7 +57,7 @@ concept __unqualified_begin = !__member_begin<_Tp> && __can_borrow<_Tp> && __class_or_enum> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(begin(__t)) } -> input_or_output_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_begin_, @@ -78,7 +78,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __unqualified_begin = _CCCL_FRAGMENT(__unqualified_begin_, _Tp); -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS struct __fn { @@ -141,7 +141,7 @@ void end(_Tp&) = delete; template void end(const _Tp&) = delete; -# if _CCCL_STD_VER > 2017 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_end = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { typename iterator_t<_Tp>; @@ -154,7 +154,7 @@ concept __unqualified_end = typename iterator_t<_Tp>; { _LIBCUDACXX_AUTO_CAST(end(__t)) } -> sentinel_for>; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_end_, @@ -177,7 +177,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __unqualified_end = _CCCL_FRAGMENT(__unqualified_end_, _Tp); -# endif // _CCCL_STD_VER < 2020 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/concepts.h b/libcudacxx/include/cuda/std/__ranges/concepts.h index 26d7fe421e7..4183f423ea6 100644 --- a/libcudacxx/include/cuda/std/__ranges/concepts.h +++ b/libcudacxx/include/cuda/std/__ranges/concepts.h @@ -46,7 +46,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) // [range.range] @@ -138,7 +138,8 @@ concept viewable_range = || (!view> && (is_lvalue_reference_v<_Tp> || (movable> && !__is_std_initializer_list>) ))); -# else // ^^^ C++20 ^^^ / vvv C++17 vvv + +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv // [range.range] template @@ -285,13 +286,13 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT viewable_range = _CCCL_FRAGMENT(__viewable_range_, _Tp); -# endif // _CCCL_STD_VER >= 2017 +# endif // _CCCL_NO_CONCEPTS //[container.intro.reqmts] -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __container_compatible_range = input_range<_Range> && convertible_to, _Tp>; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __container_compatible_range_, @@ -299,7 +300,7 @@ _CCCL_CONCEPT_FRAGMENT( template 
_CCCL_CONCEPT __container_compatible_range = _CCCL_FRAGMENT(__container_compatible_range_, _Range, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS #endif // _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) diff --git a/libcudacxx/include/cuda/std/__ranges/data.h b/libcudacxx/include/cuda/std/__ranges/data.h index f5bf6015963..0f756d52a9f 100644 --- a/libcudacxx/include/cuda/std/__ranges/data.h +++ b/libcudacxx/include/cuda/std/__ranges/data.h @@ -43,7 +43,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__data) template _CCCL_CONCEPT __ptr_to_object = is_pointer_v<_Tp> && is_object_v>; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_data = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.data()) } -> __ptr_to_object; @@ -53,7 +53,7 @@ template concept __ranges_begin_invocable = !__member_data<_Tp> && __can_borrow<_Tp> && requires(_Tp&& __t) { { _CUDA_VRANGES::begin(__t) } -> contiguous_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__member_data_, requires(_Tp&& __t)(requires(__can_borrow<_Tp>), @@ -71,7 +71,7 @@ _CCCL_CONCEPT_FRAGMENT(__ranges_begin_invocable_, template _CCCL_CONCEPT __ranges_begin_invocable = _CCCL_FRAGMENT(__ranges_begin_invocable_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/empty.h b/libcudacxx/include/cuda/std/__ranges/empty.h index d8f8213e9a8..1494c18882f 100644 --- a/libcudacxx/include/cuda/std/__ranges/empty.h +++ b/libcudacxx/include/cuda/std/__ranges/empty.h @@ -33,7 +33,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__empty) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_empty = __workaround_52970<_Tp> && requires(_Tp&& __t) { bool(__t.empty()); }; @@ -45,7 +45,7 @@ concept __can_compare_begin_end = !__member_empty<_Tp> && !__can_invoke_size<_Tp bool(_CUDA_VRANGES::begin(__t) == _CUDA_VRANGES::end(__t)); { _CUDA_VRANGES::begin(__t) } -> forward_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__member_empty_, requires(_Tp&& __t)(requires(__workaround_52970<_Tp>), (bool(__t.empty())))); @@ -69,7 +69,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __can_compare_begin_end = _CCCL_FRAGMENT(__can_compare_begin_end_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/enable_view.h b/libcudacxx/include/cuda/std/__ranges/enable_view.h index 1e5a09cd541..72e390c0499 100644 --- a/libcudacxx/include/cuda/std/__ranges/enable_view.h +++ b/libcudacxx/include/cuda/std/__ranges/enable_view.h @@ -56,14 +56,14 @@ _CCCL_TEMPLATE(class _Op, class _Yp) _CCCL_REQUIRES(is_convertible_v<_Op*, view_interface<_Yp>*>) _LIBCUDACXX_HIDE_FROM_ABI void __is_derived_from_view_interface(const _Op*, const view_interface<_Yp>*); -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template _CCCL_INLINE_VAR constexpr bool enable_view = derived_from<_Tp, view_base> || requires { _CUDA_VRANGES::__is_derived_from_view_interface((_Tp*) nullptr, (_Tp*) nullptr); }; -# else // ^^^ _CCCL_STD_VER >= 2020 ^^^ / vvv _CCCL_STD_VER <= 2017 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_INLINE_VAR constexpr bool 
enable_view = derived_from<_Tp, view_base>; @@ -72,7 +72,7 @@ template _CCCL_INLINE_VAR constexpr bool enable_view<_Tp, void_t> = true; -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS #endif // _CCCL_STD_VER >= 2017 diff --git a/libcudacxx/include/cuda/std/__ranges/rbegin.h b/libcudacxx/include/cuda/std/__ranges/rbegin.h index 8b70f702797..13cf76b9da9 100644 --- a/libcudacxx/include/cuda/std/__ranges/rbegin.h +++ b/libcudacxx/include/cuda/std/__ranges/rbegin.h @@ -43,7 +43,7 @@ void rbegin(_Tp&) = delete; template void rbegin(const _Tp&) = delete; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_rbegin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.rbegin()) } -> input_or_output_iterator; @@ -61,7 +61,7 @@ concept __can_reverse = { _CUDA_VRANGES::begin(__t) } -> same_as; { _CUDA_VRANGES::begin(__t) } -> bidirectional_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_rbegin_, @@ -94,7 +94,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __can_reverse = _CCCL_FRAGMENT(__can_reverse_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/rend.h b/libcudacxx/include/cuda/std/__ranges/rend.h index 5c266d63bdd..3f21c323eba 100644 --- a/libcudacxx/include/cuda/std/__ranges/rend.h +++ b/libcudacxx/include/cuda/std/__ranges/rend.h @@ -44,7 +44,7 @@ void rend(_Tp&) = delete; template void rend(const _Tp&) = delete; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_rend = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { _CUDA_VRANGES::rbegin(__t); @@ -63,7 +63,7 @@ concept __can_reverse = __can_borrow<_Tp> && !__member_rend<_Tp> && !__unqualifi { _CUDA_VRANGES::begin(__t) } -> same_as; { _CUDA_VRANGES::begin(__t) } -> bidirectional_iterator; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( __member_rend_, @@ -100,7 +100,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __can_reverse = _CCCL_FRAGMENT(__can_reverse_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS class __fn { diff --git a/libcudacxx/include/cuda/std/__ranges/size.h b/libcudacxx/include/cuda/std/__ranges/size.h index 04487441586..0b432ae6e87 100644 --- a/libcudacxx/include/cuda/std/__ranges/size.h +++ b/libcudacxx/include/cuda/std/__ranges/size.h @@ -52,7 +52,7 @@ void size(const _Tp&) = delete; template _CCCL_CONCEPT __size_enabled = !disable_sized_range>; -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __member_size = __size_enabled<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) { { _LIBCUDACXX_AUTO_CAST(__t.size()) } -> __integer_like; @@ -70,7 +70,7 @@ concept __difference = { _CUDA_VRANGES::begin(__t) } -> forward_iterator; { _CUDA_VRANGES::end(__t) } -> sized_sentinel_for()))>; }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__member_size_, requires(_Tp&& __t)(requires(__size_enabled<_Tp>), @@ -103,7 +103,7 @@ _CCCL_CONCEPT_FRAGMENT( template _CCCL_CONCEPT __difference = _CCCL_FRAGMENT(__difference_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS struct __fn { @@ -162,10 +162,10 @@ _CCCL_GLOBAL_CONSTANT 
auto size = __size::__fn{}; // [range.prim.ssize] _LIBCUDACXX_BEGIN_NAMESPACE_CPO(__ssize) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __can_ssize = requires(_Tp&& __t) { _CUDA_VRANGES::size(__t); }; -# else // ^^^ CXX20 ^^^ / vvv CXX17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__can_ssize_, requires(_Tp&& __t)(requires(!is_unbounded_array_v<_Tp>), ((void) _CUDA_VRANGES::size(__t)))); diff --git a/libcudacxx/include/cuda/std/__ranges/subrange.h b/libcudacxx/include/cuda/std/__ranges/subrange.h index 190df21d43b..484ce8c1f46 100644 --- a/libcudacxx/include/cuda/std/__ranges/subrange.h +++ b/libcudacxx/include/cuda/std/__ranges/subrange.h @@ -60,7 +60,7 @@ _CCCL_DIAG_SUPPRESS_MSVC(4848) _LIBCUDACXX_BEGIN_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __uses_nonqualification_pointer_conversion = is_pointer_v<_From> && is_pointer_v<_To> @@ -106,7 +106,7 @@ template concept __subrange_to_pair = __different_from<_Pair, subrange<_Iter, _Sent, _Kind>> && __pair_like_convertible_from<_Pair, const _Iter&, const _Sent&>; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT( @@ -211,19 +211,19 @@ _CCCL_CONCEPT_FRAGMENT(__subrange_to_pair_, template _CCCL_CONCEPT __subrange_to_pair = _CCCL_FRAGMENT(__subrange_to_pair_, _Iter, _Sent, integral_constant, _Pair); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template _Sent, subrange_kind _Kind> requires(_Kind == subrange_kind::sized || !sized_sentinel_for<_Sent, _Iter>) -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template , int>, enable_if_t, int>, enable_if_t<(_Kind == subrange_kind::sized || !sized_sentinel_for<_Sent, _Iter>), int>> -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS class _CCCL_TYPE_VISIBILITY_DEFAULT subrange : public view_interface> { public: @@ -243,15 +243,15 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT subrange : public view_interface= 2020 +# if !defined(_CCCL_NO_CONCEPTS) subrange() requires default_initializable<_Iter> = default; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template , int> = 0> _LIBCUDACXX_HIDE_FROM_ABI constexpr subrange() noexcept(is_nothrow_default_constructible_v<_It>) : view_interface>(){}; -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS _CCCL_TEMPLATE(class _It) _CCCL_REQUIRES(__subrange_from_iter_sent<_Iter, _It, _StoreSize>) diff --git a/libcudacxx/include/cuda/std/__ranges/view_interface.h b/libcudacxx/include/cuda/std/__ranges/view_interface.h index 661e20c1b68..a5055867542 100644 --- a/libcudacxx/include/cuda/std/__ranges/view_interface.h +++ b/libcudacxx/include/cuda/std/__ranges/view_interface.h @@ -39,25 +39,25 @@ _LIBCUDACXX_BEGIN_NAMESPACE_RANGES #if _CCCL_STD_VER >= 2017 && !_CCCL_COMPILER(MSVC2017) -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template concept __can_empty = requires(_Tp& __t) { _CUDA_VRANGES::empty(__t); }; -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template _CCCL_CONCEPT_FRAGMENT(__can_empty_, requires(_Tp& __t)(typename(decltype(_CUDA_VRANGES::empty(__t))))); template _CCCL_CONCEPT __can_empty = 
_CCCL_FRAGMENT(__can_empty_, _Tp); -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI -# if _CCCL_STD_VER >= 2020 +# if !defined(_CCCL_NO_CONCEPTS) template requires is_class_v<_Derived> && same_as<_Derived, remove_cv_t<_Derived>> -# else // ^^^ C++20 ^^^ / vvv C++17 vvv +# else // ^^^ !_CCCL_NO_CONCEPTS ^^^ / vvv _CCCL_NO_CONCEPTS vvv template && same_as<_Derived, remove_cv_t<_Derived>>, int>> -# endif // _CCCL_STD_VER <= 2017 +# endif // _CCCL_NO_CONCEPTS class view_interface { _LIBCUDACXX_HIDE_FROM_ABI constexpr _Derived& __derived() noexcept From 7d35d56657e65137497ceff6c06858f56ec6fda5 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Mon, 25 Nov 2024 21:05:37 -0800 Subject: [PATCH 26/45] remove definition of macro that is no longer used (#2957) --- libcudacxx/test/utils/libcudacxx/test/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libcudacxx/test/utils/libcudacxx/test/config.py b/libcudacxx/test/utils/libcudacxx/test/config.py index 0ae9f226d65..4bf1f48739b 100644 --- a/libcudacxx/test/utils/libcudacxx/test/config.py +++ b/libcudacxx/test/utils/libcudacxx/test/config.py @@ -995,7 +995,6 @@ def configure_compile_flags_rtti(self): self.cxx.compile_flags += ['-D_SILENCE_CXX20_CISO646_REMOVED_WARNING'] else: self.cxx.compile_flags += ['-fno-rtti'] - self.cxx.compile_flags += ['-D_LIBCUDACXX_NO_RTTI'] def configure_compile_flags_abi_version(self): abi_version = self.get_lit_conf('abi_version', '').strip() From 658e0bbfa7a8d309c1f234ae77f1352c8f8849ac Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 09:00:35 +0100 Subject: [PATCH 27/45] Avoid symbol clashes with libc++ (#2955) * Drop `__libcpp` prefix in favor of `__cccl` libc++ has moved towards just using unqualified `std::` for any partial qualification. That leads to a high chance of symbol clashes if we use the same names as them. 
As a first step, replace all uses of `__libcpp` with `__cccl`. I was wondering about `__libcupp` but :shrug:
* Backport `is_constant_evaluated`
---
 .../cuda/experimental/__async/stop_token.cuh | 2 +-
 .../cuda/__barrier/barrier_block_scope.h | 16 +-
 libcudacxx/include/cuda/pipeline | 2 +-
 .../include/cuda/std/__algorithm/copy.h | 4 +-
 .../cuda/std/__atomic/wait/notify_wait.h | 4 +-
 .../include/cuda/std/__atomic/wait/polling.h | 2 +-
 .../include/cuda/std/__barrier/barrier.h | 4 +-
 libcudacxx/include/cuda/std/__bit/clz.h | 16 +-
 libcudacxx/include/cuda/std/__bit/countl.h | 13 +-
 libcudacxx/include/cuda/std/__bit/countr.h | 13 +-
 libcudacxx/include/cuda/std/__bit/ctz.h | 16 +-
 .../include/cuda/std/__bit/has_single_bit.h | 4 +-
 libcudacxx/include/cuda/std/__bit/integral.h | 10 +-
 libcudacxx/include/cuda/std/__bit/popc.h | 16 +-
 libcudacxx/include/cuda/std/__bit/popcount.h | 12 +-
 libcudacxx/include/cuda/std/__bit/reference.h | 6 +-
 libcudacxx/include/cuda/std/__bit/rotate.h | 8 +-
 .../include/cuda/std/__complex/nvbf16.h | 2 +-
 .../include/cuda/std/__complex/nvfp16.h | 2 +-
 .../include/cuda/std/__concepts/arithmetic.h | 2 +-
 .../cuda/std/__iterator/erase_if_container.h | 2 +-
 .../include/cuda/std/__memory/allocator.h | 16 +-
 .../cuda/std/__memory/builtin_new_allocator.h | 6 +-
 .../include/cuda/std/__memory/construct_at.h | 8 +-
 .../cuda/std/__memory/temporary_buffer.h | 2 +-
 .../std/__memory/uninitialized_algorithms.h | 6 +-
 libcudacxx/include/cuda/std/__new/allocate.h | 22 +-
 .../cuda/std/__semaphore/atomic_semaphore.h | 4 +-
 .../include/cuda/std/__string/string_view.h | 2 +-
 .../cuda/std/__thread/threading_support.h | 12 +-
 .../std/__thread/threading_support_cuda.h | 4 +-
 .../std/__thread/threading_support_external.h | 4 +-
 .../std/__thread/threading_support_pthread.h | 52 ++--
 .../std/__thread/threading_support_win32.h | 22 +-
 .../std/__type_traits/add_lvalue_reference.h | 2 +-
 .../cuda/std/__type_traits/add_pointer.h | 2 +-
 .../std/__type_traits/add_rvalue_reference.h | 2 +-
 .../include/cuda/std/__type_traits/decay.h | 2 +-
 .../cuda/std/__type_traits/is_bounded_array.h | 4 +-
 .../std/__type_traits/is_constant_evaluated.h | 11 +-
 .../cuda/std/__type_traits/is_constructible.h | 17 +-
 .../include/cuda/std/__type_traits/is_empty.h | 6 +-
 .../std/__type_traits/is_floating_point.h | 10 +-
 .../cuda/std/__type_traits/is_integral.h | 40 +--
 .../is_member_function_pointer.h | 6 +-
 .../__type_traits/is_member_object_pointer.h | 2 +-
 .../std/__type_traits/is_member_pointer.h | 2 +-
 .../std/__type_traits/is_nothrow_assignable.h | 8 +-
 .../__type_traits/is_nothrow_constructible.h | 12 +-
 .../__type_traits/is_nothrow_destructible.h | 10 +-
 .../cuda/std/__type_traits/is_pointer.h | 6 +-
 .../cuda/std/__type_traits/is_referenceable.h | 8 +-
 .../cuda/std/__type_traits/is_signed.h | 10 +-
 .../std/__type_traits/is_signed_integer.h | 14 +-
 .../cuda/std/__type_traits/is_swappable.h | 4 +-
 .../__type_traits/is_trivially_destructible.h | 4 +-
 .../std/__type_traits/is_unbounded_array.h | 4 +-
 .../include/cuda/std/__type_traits/is_union.h | 4 +-
 .../cuda/std/__type_traits/is_unsigned.h | 10 +-
 .../std/__type_traits/is_unsigned_integer.h | 14 +-
 .../include/cuda/std/__utility/unreachable.h | 2 +-
 .../cuda/std/detail/libcxx/include/__string | 4 +-
 .../cuda/std/detail/libcxx/include/algorithm | 2 +-
 .../cuda/std/detail/libcxx/include/complex | 32 +--
 .../cuda/std/detail/libcxx/include/limits | 22 +-
 libcudacxx/test/NOTES.TXT | 2 +-
 .../is_constant_evaluated.pass.cpp | 10 +-
 .../utilities/meta/is_referenceable.pass.cpp | 258
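As context for the "Backport `is_constant_evaluated`" item above: the hunks in this patch switch call sites from the old `__libcpp_is_constant_evaluated()` helper to `_CUDA_VSTD::is_constant_evaluated()`. Below is a minimal sketch of how such a pre-C++20 backport can sit on top of the compiler intrinsic; the function name, header guard, and feature probe are illustrative assumptions, not the contents of the real `is_constant_evaluated.h`.

```cpp
// Illustrative sketch only (not the actual libcu++ header): provide an
// is_constant_evaluated() usable before C++20 by forwarding to the compiler
// builtin when it exists, and conservatively reporting "runtime" otherwise.
#ifndef MY_IS_CONSTANT_EVALUATED_H // hypothetical header guard
#define MY_IS_CONSTANT_EVALUATED_H

constexpr bool my_is_constant_evaluated() noexcept
{
#if defined(__has_builtin)
#  if __has_builtin(__builtin_is_constant_evaluated)
  return __builtin_is_constant_evaluated(); // GCC 9+, Clang 9+, recent MSVC
#  else
  return false; // no builtin: callers take the runtime code path
#  endif
#else
  return false; // preprocessor cannot probe builtins: assume runtime
#endif
}

#endif // MY_IS_CONSTANT_EVALUATED_H
```

Call sites such as `__dispatch_memmove` in `__algorithm/copy.h` (further down in this patch) use the result to bail out of the `memmove` fast path during constant evaluation, where `memmove` and raw pointer comparisons are not usable.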
+++++++++--------- .../partial_sort_copy.pass.cpp | 4 +- .../partial_sort_copy_comp.pass.cpp | 4 +- .../sequences/inplace_vector/access.pass.cpp | 2 +- .../sequences/inplace_vector/assign.pass.cpp | 2 +- .../inplace_vector/assignment.pass.cpp | 2 +- .../inplace_vector/capacity.pass.cpp | 2 +- .../inplace_vector/comparison.pass.cpp | 2 +- .../inplace_vector/constructor.pass.cpp | 4 +- .../sequences/inplace_vector/emplace.pass.cpp | 2 +- .../sequences/inplace_vector/insert.pass.cpp | 2 +- .../inplace_vector/iterators.pass.cpp | 2 +- .../sequences/inplace_vector/resize.pass.cpp | 2 +- .../sequences/inplace_vector/swap.pass.cpp | 2 +- .../unique.ptr.ctor/pointer_deleter.pass.cpp | 4 +- .../is_constant_evaluated.fail.cpp | 28 -- .../meta.unary.prop/is_constructible.pass.cpp | 18 +- libcudacxx/test/support/check_assertion.h | 2 +- libcudacxx/test/support/test_macros.h | 2 +- 86 files changed, 455 insertions(+), 493 deletions(-) delete mode 100644 libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp diff --git a/cudax/include/cuda/experimental/__async/stop_token.cuh b/cudax/include/cuda/experimental/__async/stop_token.cuh index 52ff380ad99..32aeb3ea63a 100644 --- a/cudax/include/cuda/experimental/__async/stop_token.cuh +++ b/cudax/include/cuda/experimental/__async/stop_token.cuh @@ -96,7 +96,7 @@ struct __spin_wait else { --__count_; - _CUDA_VSTD::__libcpp_thread_yield_processor(); + _CUDA_VSTD::__cccl_thread_yield_processor(); } } diff --git a/libcudacxx/include/cuda/__barrier/barrier_block_scope.h b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h index 163aad61da1..e79165ae8d0 100644 --- a/libcudacxx/include/cuda/__barrier/barrier_block_scope.h +++ b/libcudacxx/include/cuda/__barrier/barrier_block_scope.h @@ -229,7 +229,7 @@ class barrier : public __blo ( int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); } else if (!__isShared(&__barrier)) { __trap(); } @@ -256,7 +256,7 @@ class barrier : public __blo ( bool __ready = 0; if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), __nanosec); } @@ -267,7 +267,7 @@ class barrier : public __blo } while (!__ready && __nanosec > (_CUDA_VSTD::chrono::high_resolution_clock::now() - __start)); return __ready;), NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + (return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__token)), _CUDA_VSTD::chrono::nanoseconds(__nanosec));)) } @@ -331,7 +331,7 @@ class barrier : public __blo ( int32_t __ready = 0; if (!__isClusterShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); } else if (!__isShared(&__barrier)) { __trap(); } @@ -359,7 +359,7 @@ class barrier : public __blo ( bool __ready = 0; if (!__isShared(&__barrier)) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec); } @@ -371,20 +371,20 @@ class barrier : public __blo return __ready;), 
NV_ANY_TARGET, - (return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + (return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity), __nanosec);)) } public: _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_phase(this, _CUDA_VSTD::move(__phase))); } _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __phase_parity) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity(this, __phase_parity)); } diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 564075e1827..d034c931644 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -296,7 +296,7 @@ public: _LIBCUDACXX_HIDE_FROM_ABI bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __duration) { barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__tail)->__produced; - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( _CUDA_VSTD::__barrier_poll_tester_parity>(&__stage_barrier, __produced_phase_parity), _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration)); } diff --git a/libcudacxx/include/cuda/std/__algorithm/copy.h b/libcudacxx/include/cuda/std/__algorithm/copy.h index 2333a19a604..f4013d4ea73 100644 --- a/libcudacxx/include/cuda/std/__algorithm/copy.h +++ b/libcudacxx/include/cuda/std/__algorithm/copy.h @@ -54,7 +54,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 bool __dispatch_memmove(_Up* __r return false; #endif - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return false; } @@ -114,7 +114,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 pair<_Tp*, _Up*> __copy(_Tp* __f { return {__last, __result + __n}; } - if ((!__libcpp_is_constant_evaluated() && __first < __result) + if ((!_CUDA_VSTD::is_constant_evaluated() && __first < __result) || __constexpr_tail_overlap(__first, __result, __last)) { for (ptrdiff_t __i = __n; __i > 0; --__i) diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h index b79b22adad6..1c4c23d959e 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -72,11 +72,11 @@ _LIBCUDACXX_HIDE_FROM_ABI void __atomic_wait( } if (__i < 12) { - _CUDA_VSTD::__libcpp_thread_yield_processor(); + _CUDA_VSTD::__cccl_thread_yield_processor(); } else { - _CUDA_VSTD::__libcpp_thread_yield(); + _CUDA_VSTD::__cccl_thread_yield(); } } while (__nonatomic_compare_equal(__atomic_load_dispatch(__a, __order, _Sco{}), __val)) diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h index cbb1a73a4b8..54ba4a08948 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/polling.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -53,7 +53,7 @@ template _CCCL_HOST_DEVICE void __atomic_try_wait_slow_fallback( _Tp const volatile* __a, __atomic_underlying_remove_cv_t<_Tp> __val, memory_order __order, _Sco) { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); + _CUDA_VSTD::__cccl_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); } 
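Several of the helpers renamed in the hunks above (`__cccl_thread_poll_with_backoff`, `__cccl_thread_yield_processor`, `__cccl_thread_yield`, `__cccl_thread_sleep_for`) cooperate to implement a spin-then-back-off wait, which the `threading_support.h` hunk later in this patch spells out. The following is a simplified, host-only sketch of that idea using plain standard-library facilities; it is illustrative rather than the libcu++ implementation, and the polling count and thresholds are made-up stand-ins for `_LIBCUDACXX_POLLING_COUNT` and the real tuning constants.

```cpp
// Simplified host-only sketch of a poll-with-backoff loop (illustrative only).
// Spin on the predicate first, then progressively back off by yielding and
// finally sleeping, with an optional timeout.
#include <chrono>
#include <thread>

template <class Predicate>
bool poll_with_backoff(Predicate ready,
                       std::chrono::nanoseconds max_wait = std::chrono::nanoseconds::zero())
{
  constexpr int polling_count = 64; // stand-in for _LIBCUDACXX_POLLING_COUNT
  auto const start            = std::chrono::high_resolution_clock::now();
  for (int count = 0;; ++count)
  {
    if (ready())
    {
      return true; // condition satisfied
    }
    if (count < polling_count)
    {
      continue; // hot spin phase
    }
    auto const elapsed = std::chrono::high_resolution_clock::now() - start;
    if (max_wait != std::chrono::nanoseconds::zero() && elapsed >= max_wait)
    {
      return false; // timed out
    }
    // Back off harder the longer we have already waited.
    auto const step = elapsed / 4;
    if (step >= std::chrono::milliseconds(1))
    {
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    else if (step >= std::chrono::microseconds(10))
    {
      std::this_thread::sleep_for(step);
    }
    else
    {
      std::this_thread::yield();
    }
  }
}
```

A caller typically passes a lambda that polls an atomic, e.g. `poll_with_backoff([&] { return flag.load(std::memory_order_acquire); }, std::chrono::milliseconds(5));`, which mirrors how `__atomic_try_wait_slow_fallback` and the semaphore's `__acquire_slow_timed` use the real helper in the hunks of this patch.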
_LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__barrier/barrier.h b/libcudacxx/include/cuda/std/__barrier/barrier.h index 491998132a8..e17d4a2d111 100644 --- a/libcudacxx/include/cuda/std/__barrier/barrier.h +++ b/libcudacxx/include/cuda/std/__barrier/barrier.h @@ -192,12 +192,12 @@ class __barrier_base<__empty_completion, _Sco> } _LIBCUDACXX_HIDE_FROM_ABI void wait(arrival_token&& __phase) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__cccl_thread_poll_with_backoff( __barrier_poll_tester_phase<__barrier_base>(this, _CUDA_VSTD::move(__phase))); } _LIBCUDACXX_HIDE_FROM_ABI void wait_parity(bool __parity) const { - _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); + _CUDA_VSTD::__cccl_thread_poll_with_backoff(__barrier_poll_tester_parity<__barrier_base>(this, __parity)); } _LIBCUDACXX_HIDE_FROM_ABI void arrive_and_wait() { diff --git a/libcudacxx/include/cuda/std/__bit/clz.h b/libcudacxx/include/cuda/std/__bit/clz.h index 267f022737a..791db82ca7f 100644 --- a/libcudacxx/include/cuda/std/__bit/clz.h +++ b/libcudacxx/include/cuda/std/__bit/clz.h @@ -75,10 +75,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_clz(uint64_t __x) noexcept # endif } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint32_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clz(__x);), (return __builtin_clz(__x);)) } @@ -86,10 +86,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) noexcept return __constexpr_clz(__x); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint64_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __clzll(__x);), (return __builtin_clzll(__x);)) } @@ -100,10 +100,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) noexcept #else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint32_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; if (_BitScanReverse(&__where, __x)) @@ -117,10 +117,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint32_t __x) return __binary_clz32(static_cast(__x), 0); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_clz(uint64_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_clz(uint64_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; # if defined(_LIBCUDACXX_HAS_BITSCAN64) diff --git a/libcudacxx/include/cuda/std/__bit/countl.h b/libcudacxx/include/cuda/std/__bit/countl.h index f15e14a5293..3642d17de09 100644 --- a/libcudacxx/include/cuda/std/__bit/countl.h +++ b/libcudacxx/include/cuda/std/__bit/countl.h @@ -38,14 +38,14 @@ template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countl_zero_dispatch(_Tp __t) noexcept { - return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); + return 
__cccl_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); } template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countl_zero_dispatch(_Tp __t) noexcept { - return __libcpp_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); + return __cccl_clz(static_cast(__t)) - (numeric_limits::digits - numeric_limits<_Tp>::digits); } template @@ -90,27 +90,26 @@ __countl_zero_dispatch(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_zero(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned"); return __t ? __countl_zero_dispatch(__t) : numeric_limits<_Tp>::digits; } template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countl_one(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned"); return __t != numeric_limits<_Tp>::max() ? __countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countl_zero(_Tp __t) noexcept { return __countl_zero(__t); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countl_one(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countl_one(_Tp __t) noexcept { return __countl_one(__t); } diff --git a/libcudacxx/include/cuda/std/__bit/countr.h b/libcudacxx/include/cuda/std/__bit/countr.h index 21e65f800ba..e7a2b609abe 100644 --- a/libcudacxx/include/cuda/std/__bit/countr.h +++ b/libcudacxx/include/cuda/std/__bit/countr.h @@ -37,14 +37,14 @@ template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countr_zero_dispatch(_Tp __t) noexcept { - return __libcpp_ctz(static_cast(__t)); + return __cccl_ctz(static_cast(__t)); } template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __countr_zero_dispatch(_Tp __t) noexcept { - return __libcpp_ctz(static_cast(__t)); + return __cccl_ctz(static_cast(__t)); } template @@ -83,7 +83,7 @@ __countr_zero_dispatch(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_zero(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned"); return __t ? __countr_zero_dispatch(__t) : numeric_limits<_Tp>::digits; } @@ -91,20 +91,19 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_zero(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __countr_one(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned"); return __t != numeric_limits<_Tp>::max() ? 
__countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countr_zero(_Tp __t) noexcept { return __countr_zero(__t); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -countr_one(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> countr_one(_Tp __t) noexcept { return __countr_one(__t); } diff --git a/libcudacxx/include/cuda/std/__bit/ctz.h b/libcudacxx/include/cuda/std/__bit/ctz.h index 9d2e771bd61..813afa6be65 100644 --- a/libcudacxx/include/cuda/std/__bit/ctz.h +++ b/libcudacxx/include/cuda/std/__bit/ctz.h @@ -75,10 +75,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_ctz(uint64_t __x) noexcept # endif } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint32_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET( NV_IS_DEVICE, (return (!__x) ? (sizeof(uint32_t) * 8) : (__ffs(__x) - 1);), (return __builtin_ctz(__x);)) @@ -87,10 +87,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) noexcept return __constexpr_ctz(__x); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint64_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET( NV_IS_DEVICE, (return (!__x) ? 
(sizeof(uint64_t) * 8) : (__ffsll(__x) - 1);), (return __builtin_ctzll(__x);)) @@ -102,10 +102,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) noexcept #else // _CCCL_COMPILER(MSVC) // Precondition: __x != 0 -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint32_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; if (_BitScanForward(&__where, __x)) @@ -119,10 +119,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint32_t __x) return __binary_ctz32(static_cast(__x), 0); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_ctz(uint64_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_ctz(uint64_t __x) { # if !defined(__CUDA_ARCH__) - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { unsigned long __where = 0; # if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__)) diff --git a/libcudacxx/include/cuda/std/__bit/has_single_bit.h b/libcudacxx/include/cuda/std/__bit/has_single_bit.h index 1cd207f72bb..07586899549 100644 --- a/libcudacxx/include/cuda/std/__bit/has_single_bit.h +++ b/libcudacxx/include/cuda/std/__bit/has_single_bit.h @@ -29,12 +29,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __has_single_bit(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned"); return __t != 0 && (((__t & (__t - 1)) == 0)); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool> +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, bool> has_single_bit(_Tp __t) noexcept { return __has_single_bit(__t); diff --git a/libcudacxx/include/cuda/std/__bit/integral.h b/libcudacxx/include/cuda/std/__bit/integral.h index 869972f3422..f0186ad9f5f 100644 --- a/libcudacxx/include/cuda/std/__bit/integral.h +++ b/libcudacxx/include/cuda/std/__bit/integral.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _LIBCUDACXX_HIDE_FROM_ABI constexpr uint32_t __bit_log2(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned"); return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t); } @@ -51,21 +51,19 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> -bit_floor(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_floor(_Tp __t) noexcept { return __t == 0 ? 0 : static_cast<_Tp>(_Tp{1} << __bit_log2(__t)); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> bit_ceil(_Tp __t) noexcept { return (__t < 2) ? 
1 : static_cast<_Tp>(__ceil2(__t)); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> -bit_width(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> bit_width(_Tp __t) noexcept { return __t == 0 ? 0 : static_cast(__bit_log2(__t) + 1); } diff --git a/libcudacxx/include/cuda/std/__bit/popc.h b/libcudacxx/include/cuda/std/__bit/popc.h index dc22999b985..6a1cb93239f 100644 --- a/libcudacxx/include/cuda/std/__bit/popc.h +++ b/libcudacxx/include/cuda/std/__bit/popc.h @@ -76,10 +76,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __constexpr_popcount(uint64_t __x) noexc # endif } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint32_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popc(__x);), (return __builtin_popcount(__x);)) } @@ -87,10 +87,10 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) noexcept return __constexpr_popcount(static_cast(__x)); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint64_t __x) noexcept { # if _CCCL_STD_VER >= 2014 - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return __popcll(__x);), (return __builtin_popcountll(__x);)) } @@ -100,9 +100,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) noexcept #else // _CCCL_COMPILER(MSVC) -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint32_t __x) { - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_TARGET(NV_IS_HOST, (return static_cast(_LIBCUDACXX_MSVC_POPC(__x));)) } @@ -110,9 +110,9 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint32_t __x) return __fallback_popc64(static_cast(__x)); } -_LIBCUDACXX_HIDE_FROM_ABI constexpr int __libcpp_popc(uint64_t __x) +_LIBCUDACXX_HIDE_FROM_ABI constexpr int __cccl_popc(uint64_t __x) { - if (!__libcpp_default_is_constant_evaluated()) + if (!__cccl_default_is_constant_evaluated()) { NV_IF_TARGET(NV_IS_HOST, (return static_cast(_LIBCUDACXX_MSVC_POPC64(__x));)) } diff --git a/libcudacxx/include/cuda/std/__bit/popcount.h b/libcudacxx/include/cuda/std/__bit/popcount.h index 5d4395cb457..18c8d97dd30 100644 --- a/libcudacxx/include/cuda/std/__bit/popcount.h +++ b/libcudacxx/include/cuda/std/__bit/popcount.h @@ -33,14 +33,14 @@ template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __popcount_dispatch(_Tp __t) noexcept { - return __libcpp_popc(static_cast(__t)); + return __cccl_popc(static_cast(__t)); } template _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t __popcount_dispatch(_Tp __t) noexcept { - return __libcpp_popc(static_cast(__t)); + return __cccl_popc(static_cast(__t)); } template @@ -49,7 +49,7 @@ struct __popcount_rsh_impl static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __count(_Tp __t) { return __popcount_rsh_impl<_Tp, _St - 1>::__count(__t >> numeric_limits::digits) - + __libcpp_popc(static_cast(__t)); + + __cccl_popc(static_cast(__t)); } }; @@ -58,7 +58,7 @@ struct __popcount_rsh_impl<_Tp, 1> { static _LIBCUDACXX_HIDE_FROM_ABI constexpr int __count(_Tp __t) { - return __libcpp_popc(static_cast(__t)); + 
return __cccl_popc(static_cast(__t)); } }; @@ -72,13 +72,13 @@ __popcount_dispatch(_Tp __t) noexcept template _LIBCUDACXX_HIDE_FROM_ABI constexpr int __popcount(_Tp __t) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__cccl_popcount requires unsigned"); return __popcount_dispatch(__t); } template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int> popcount(_Tp __t) noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, int> popcount(_Tp __t) noexcept { return __popcount(__t); } diff --git a/libcudacxx/include/cuda/std/__bit/reference.h b/libcudacxx/include/cuda/std/__bit/reference.h index d4c7320a701..12acac014b1 100644 --- a/libcudacxx/include/cuda/std/__bit/reference.h +++ b/libcudacxx/include/cuda/std/__bit/reference.h @@ -109,7 +109,7 @@ class __bit_reference } _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, false> operator&() const noexcept { - return __bit_iterator<_Cp, false>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); + return __bit_iterator<_Cp, false>(__seg_, static_cast(_CUDA_VSTD::__cccl_ctz(__mask_))); } friend _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void @@ -180,7 +180,7 @@ class __bit_const_reference _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __bit_iterator<_Cp, true> operator&() const noexcept { - return __bit_iterator<_Cp, true>(__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__mask_))); + return __bit_iterator<_Cp, true>(__seg_, static_cast(_CUDA_VSTD::__cccl_ctz(__mask_))); } private: @@ -812,7 +812,7 @@ struct __bit_array _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 explicit __bit_array(difference_type __s) : __size_(__s) { - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { for (size_t __i = 0; __i != __bit_array<_Cp>::_Np; ++__i) { diff --git a/libcudacxx/include/cuda/std/__bit/rotate.h b/libcudacxx/include/cuda/std/__bit/rotate.h index 0d5d7652a91..bf2c2e5f61a 100644 --- a/libcudacxx/include/cuda/std/__bit/rotate.h +++ b/libcudacxx/include/cuda/std/__bit/rotate.h @@ -30,7 +30,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotl(_Tp __t, unsigned int __cnt) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned"); using __nlt = numeric_limits<_Tp>; return ((__cnt % __nlt::digits) == 0) @@ -41,7 +41,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotl(_Tp __t, unsigned template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotr(_Tp __t, unsigned int __cnt) noexcept { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); + static_assert(__cccl_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned"); using __nlt = numeric_limits<_Tp>; return ((__cnt % __nlt::digits) == 0) @@ -50,7 +50,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __rotr(_Tp __t, unsigned } template -_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> rotl(_Tp __t, unsigned int __cnt) noexcept { return __rotl(__t, __cnt); @@ -58,7 +58,7 @@ rotl(_Tp __t, unsigned int __cnt) noexcept // rotr template 
-_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t<__cccl_is_unsigned_integer<_Tp>::value, _Tp> rotr(_Tp __t, unsigned int __cnt) noexcept { return __rotr(__t, __cnt); diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index ede7f05a29a..0167f952141 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -83,7 +83,7 @@ struct __type_to_vector<__nv_bfloat16> }; template <> -struct __libcpp_complex_overload_traits<__nv_bfloat16, false, false> +struct __cccl_complex_overload_traits<__nv_bfloat16, false, false> { typedef __nv_bfloat16 _ValueType; typedef complex<__nv_bfloat16> _ComplexType; diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index 11406f98588..8ddd2b27747 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -80,7 +80,7 @@ struct __type_to_vector<__half> }; template <> -struct __libcpp_complex_overload_traits<__half, false, false> +struct __cccl_complex_overload_traits<__half, false, false> { typedef __half _ValueType; typedef complex<__half> _ComplexType; diff --git a/libcudacxx/include/cuda/std/__concepts/arithmetic.h b/libcudacxx/include/cuda/std/__concepts/arithmetic.h index 5a643652824..cd909548745 100644 --- a/libcudacxx/include/cuda/std/__concepts/arithmetic.h +++ b/libcudacxx/include/cuda/std/__concepts/arithmetic.h @@ -47,7 +47,7 @@ template _CCCL_CONCEPT floating_point = _CCCL_TRAIT(is_floating_point, _Tp); template -_CCCL_CONCEPT __libcpp_signed_integer = __libcpp_is_signed_integer<_Tp>::value; +_CCCL_CONCEPT __cccl_signed_integer = __cccl_is_signed_integer<_Tp>::value; #endif // ^^^ !_CCCL_NO_VARIABLE_TEMPLATES diff --git a/libcudacxx/include/cuda/std/__iterator/erase_if_container.h b/libcudacxx/include/cuda/std/__iterator/erase_if_container.h index 2d2b6e35767..e4573dc187b 100644 --- a/libcudacxx/include/cuda/std/__iterator/erase_if_container.h +++ b/libcudacxx/include/cuda/std/__iterator/erase_if_container.h @@ -24,7 +24,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -_LIBCUDACXX_HIDE_FROM_ABI typename _Container::size_type __libcpp_erase_if_container(_Container& __c, _Predicate& __pred) +_LIBCUDACXX_HIDE_FROM_ABI typename _Container::size_type __cccl_erase_if_container(_Container& __c, _Predicate& __pred) { typename _Container::size_type __old_size = __c.size(); diff --git a/libcudacxx/include/cuda/std/__memory/allocator.h b/libcudacxx/include/cuda/std/__memory/allocator.h index fecac15b13f..c771226e191 100644 --- a/libcudacxx/include/cuda/std/__memory/allocator.h +++ b/libcudacxx/include/cuda/std/__memory/allocator.h @@ -129,13 +129,13 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator : private __non_trivial_if{}.allocate(__n); } #endif // _CCCL_HAS_CONSTEXPR_ALLOCATION { - return static_cast<_Tp*>(_CUDA_VSTD::__libcpp_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); + return static_cast<_Tp*>(_CUDA_VSTD::__cccl_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); } } @@ -150,14 +150,14 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator : private __non_trivial_if{}.deallocate(__p, __n); } else #endif // _CCCL_STD_VER >= 2020 { - _CUDA_VSTD::__libcpp_deallocate((void*) __p, __n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp)); + _CUDA_VSTD::__cccl_deallocate((void*) __p, __n * sizeof(_Tp), 
_LIBCUDACXX_ALIGNOF(_Tp)); } } @@ -231,13 +231,13 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator { __throw_bad_array_new_length(); } - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return static_cast(::operator new(__n * sizeof(_Tp))); } else { - return static_cast(_CUDA_VSTD::__libcpp_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); + return static_cast(_CUDA_VSTD::__cccl_allocate(__n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp))); } } @@ -250,13 +250,13 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT allocator _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void deallocate(const _Tp* __p, size_t __n) noexcept { - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { ::operator delete(const_cast<_Tp*>(__p)); } else { - _CUDA_VSTD::__libcpp_deallocate((void*) const_cast<_Tp*>(__p), __n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp)); + _CUDA_VSTD::__cccl_deallocate((void*) const_cast<_Tp*>(__p), __n * sizeof(_Tp), _LIBCUDACXX_ALIGNOF(_Tp)); } } diff --git a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h index 03a45cac5a6..5752a48ec04 100644 --- a/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h +++ b/libcudacxx/include/cuda/std/__memory/builtin_new_allocator.h @@ -45,7 +45,7 @@ struct __builtin_new_allocator _LIBCUDACXX_HIDE_FROM_ABI void operator()(void* __p) const noexcept { - _CUDA_VSTD::__libcpp_deallocate(__p, __size_, __align_); + _CUDA_VSTD::__cccl_deallocate(__p, __size_, __align_); } private: @@ -57,12 +57,12 @@ struct __builtin_new_allocator _LIBCUDACXX_HIDE_FROM_ABI static __holder_t __allocate_bytes(size_t __s, size_t __align) { - return __holder_t(_CUDA_VSTD::__libcpp_allocate(__s, __align), __builtin_new_deleter(__s, __align)); + return __holder_t(_CUDA_VSTD::__cccl_allocate(__s, __align), __builtin_new_deleter(__s, __align)); } _LIBCUDACXX_HIDE_FROM_ABI static void __deallocate_bytes(void* __p, size_t __s, size_t __align) noexcept { - _CUDA_VSTD::__libcpp_deallocate(__p, __s, __align); + _CUDA_VSTD::__cccl_deallocate(__p, __s, __align); } template diff --git a/libcudacxx/include/cuda/std/__memory/construct_at.h b/libcudacxx/include/cuda/std/__memory/construct_at.h index 18300552e7e..aeb39a6bf18 100644 --- a/libcudacxx/include/cuda/std/__memory/construct_at.h +++ b/libcudacxx/include/cuda/std/__memory/construct_at.h @@ -114,7 +114,7 @@ construct_at(_Tp* __location, _Args&&... __args) { _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } @@ -131,7 +131,7 @@ construct_at(_Tp* __location, _Args&&... __args) { _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } @@ -150,7 +150,7 @@ __construct_at(_Tp* __location, _Args&&... 
__args) _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); #if _CCCL_STD_VER >= 2020 // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } @@ -167,7 +167,7 @@ __construct_at(_Tp* __location, _Args&&... __args) _CCCL_ASSERT(__location != nullptr, "null pointer given to construct_at"); #if _CCCL_STD_VER >= 2020 // Need to go through `std::construct_at` as that is the explicitly blessed function - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { return ::std::construct_at(__location, _CUDA_VSTD::forward<_Args>(__args)...); } diff --git a/libcudacxx/include/cuda/std/__memory/temporary_buffer.h b/libcudacxx/include/cuda/std/__memory/temporary_buffer.h index 37f64befac4..2aa33cad869 100644 --- a/libcudacxx/include/cuda/std/__memory/temporary_buffer.h +++ b/libcudacxx/include/cuda/std/__memory/temporary_buffer.h @@ -80,7 +80,7 @@ get_temporary_buffer(ptrdiff_t __n) noexcept template _LIBCUDACXX_HIDE_FROM_ABI void return_temporary_buffer(_Tp* __p) noexcept { - _CUDA_VSTD::__libcpp_deallocate_unsized((void*) __p, _LIBCUDACXX_ALIGNOF(_Tp)); + _CUDA_VSTD::__cccl_deallocate_unsized((void*) __p, _LIBCUDACXX_ALIGNOF(_Tp)); } _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h b/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h index a194efa5a02..11b476ba76c 100644 --- a/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h +++ b/libcudacxx/include/cuda/std/__memory/uninitialized_algorithms.h @@ -349,7 +349,7 @@ __allocator_destroy_multidimensional(_Alloc& __alloc, _BidirIter __first, _Bidir _CCCL_IF_CONSTEXPR (_CCCL_TRAIT(is_array, _ValueType)) { - static_assert(!__libcpp_is_unbounded_array<_ValueType>::value, + static_assert(!__cccl_is_unbounded_array<_ValueType>::value, "arrays of unbounded arrays don't exist, but if they did we would mess up here"); using _Element = remove_extent_t<_ValueType>; @@ -576,7 +576,7 @@ template < _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 _Out* __uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* __first2) { - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { while (__first1 != __last1) { @@ -650,7 +650,7 @@ template -_LIBCUDACXX_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) +_LIBCUDACXX_HIDE_FROM_ABI void* __cccl_operator_new(_Args... __args) { // Those builtins are not usable on device and the tests crash when using them #if defined(_CCCL_BUILTIN_OPERATOR_NEW) @@ -68,7 +68,7 @@ _LIBCUDACXX_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) } template -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_operator_delete(_Args... __args) { // Those builtins are not usable on device and the tests crash when using them #if defined(_CCCL_BUILTIN_OPERATOR_DELETE) @@ -78,17 +78,17 @@ _LIBCUDACXX_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... 
__args) #endif // !_CCCL_BUILTIN_OPERATOR_DELETE } -_LIBCUDACXX_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __align) +_LIBCUDACXX_HIDE_FROM_ABI void* __cccl_allocate(size_t __size, size_t __align) { #ifndef _LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const ::std::align_val_t __align_val = static_cast<::std::align_val_t>(__align); - return __libcpp_operator_new(__size, __align_val); + return __cccl_operator_new(__size, __align_val); } #endif // !_LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION (void) __align; - return __libcpp_operator_new(__size); + return __cccl_operator_new(__size); } template @@ -96,13 +96,13 @@ _LIBCUDACXX_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t _ { #ifdef _LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION (void) __size; - return _CUDA_VSTD::__libcpp_operator_delete(__ptr, __args...); + return _CUDA_VSTD::__cccl_operator_delete(__ptr, __args...); #else // ^^^ _LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION ^^^ / vvv !_LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION vvv - return _CUDA_VSTD::__libcpp_operator_delete(__ptr, __size, __args...); + return _CUDA_VSTD::__cccl_operator_delete(__ptr, __size, __args...); #endif // !_LIBCUDACXX_HAS_NO_SIZED_DEALLOCATION } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_deallocate(void* __ptr, size_t __size, size_t __align) { #ifndef _LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) @@ -115,17 +115,17 @@ _LIBCUDACXX_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, s return __do_deallocate_handle_size(__ptr, __size); } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_deallocate_unsized(void* __ptr, size_t __align) { #ifndef _LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION if (__is_overaligned_for_new(__align)) { const ::std::align_val_t __align_val = static_cast<::std::align_val_t>(__align); - return __libcpp_operator_delete(__ptr, __align_val); + return __cccl_operator_delete(__ptr, __align_val); } #endif // !_LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION (void) __align; - return __libcpp_operator_delete(__ptr); + return __cccl_operator_delete(__ptr); } _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h b/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h index 78013392630..fb6d302e771 100644 --- a/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h +++ b/libcudacxx/include/cuda/std/__semaphore/atomic_semaphore.h @@ -74,7 +74,7 @@ class __atomic_semaphore _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __acquire_slow_timed(chrono::nanoseconds const& __rel_time) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( [this]() { ptrdiff_t const __old = __count.load(memory_order_acquire); return __old != 0 && __fetch_sub_if_slow(__old); @@ -157,7 +157,7 @@ class __atomic_semaphore<_Sco, 1> _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool __acquire_slow_timed(chrono::nanoseconds const& __rel_time) { - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + return _CUDA_VSTD::__cccl_thread_poll_with_backoff( [this]() { return try_acquire(); }, diff --git a/libcudacxx/include/cuda/std/__string/string_view.h b/libcudacxx/include/cuda/std/__string/string_view.h index 458c46e3063..46bf51b589c 100644 --- a/libcudacxx/include/cuda/std/__string/string_view.h +++ 
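// Aside: a hedged sketch of the dispatch performed by the renamed __cccl_allocate /
// __cccl_deallocate and __do_deallocate_handle_size above: over-aligned requests go
// to the align_val_t overloads of operator new/delete, and sized deallocation is
// used when the size is known. The names raw_allocate/raw_deallocate below are
// illustrative, not library API.
#include <cstddef>
#include <new>

inline bool overaligned_for_new(std::size_t align) noexcept
{
  return align > __STDCPP_DEFAULT_NEW_ALIGNMENT__;
}

inline void* raw_allocate(std::size_t size, std::size_t align)
{
  if (overaligned_for_new(align))
  {
    return ::operator new(size, std::align_val_t{align});
  }
  return ::operator new(size);
}

inline void raw_deallocate(void* p, std::size_t size, std::size_t align) noexcept
{
  if (overaligned_for_new(align))
  {
    ::operator delete(p, size, std::align_val_t{align}); // sized, aligned
    return;
  }
  // Sized delete; an unsized fallback exists for toolchains without sized deallocation.
  ::operator delete(p, size);
}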
b/libcudacxx/include/cuda/std/__string/string_view.h @@ -229,7 +229,7 @@ struct __string_view { // If we're in a constant evaluated context, we cannot compare the __str_ // members for equality. - return __compare(__other, bool_constant<__libcpp_default_is_constant_evaluated()>()); + return __compare(__other, bool_constant<__cccl_default_is_constant_evaluated()>()); } _CCCL_NODISCARD_FRIEND _LIBCUDACXX_HIDE_FROM_ABI constexpr bool diff --git a/libcudacxx/include/cuda/std/__thread/threading_support.h b/libcudacxx/include/cuda/std/__thread/threading_support.h index b131dbf0f94..d2ebacf576f 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support.h @@ -52,13 +52,13 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD # define __LIBCUDACXX_ASM_THREAD_YIELD (;) # endif // !__x86_64__ -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield_processor() +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield_processor() { NV_IF_TARGET(NV_IS_HOST, __LIBCUDACXX_ASM_THREAD_YIELD) } template -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_thread_poll_with_backoff( +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_thread_poll_with_backoff( _Fn&& __f, _CUDA_VSTD::chrono::nanoseconds __max = _CUDA_VSTD::chrono::nanoseconds::zero()) { _CUDA_VSTD::chrono::high_resolution_clock::time_point const __start = @@ -73,7 +73,7 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_thread_poll_with_backoff( { if (__count > (_LIBCUDACXX_POLLING_COUNT >> 1)) { - _CUDA_VSTD::__libcpp_thread_yield_processor(); + _CUDA_VSTD::__cccl_thread_yield_processor(); } __count += 1; continue; @@ -87,15 +87,15 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_thread_poll_with_backoff( _CUDA_VSTD::chrono::nanoseconds const __step = __elapsed / 4; if (__step >= _CUDA_VSTD::chrono::milliseconds(1)) { - _CUDA_VSTD::__libcpp_thread_sleep_for(_CUDA_VSTD::chrono::milliseconds(1)); + _CUDA_VSTD::__cccl_thread_sleep_for(_CUDA_VSTD::chrono::milliseconds(1)); } else if (__step >= _CUDA_VSTD::chrono::microseconds(10)) { - _CUDA_VSTD::__libcpp_thread_sleep_for(__step); + _CUDA_VSTD::__cccl_thread_sleep_for(__step); } else { - _CUDA_VSTD::__libcpp_thread_yield(); + _CUDA_VSTD::__cccl_thread_yield(); } } } diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h b/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h index c361b0f7e06..c46cf508dca 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_cuda.h @@ -29,9 +29,9 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield() {} +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield() {} -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) { NV_IF_TARGET(NV_IS_DEVICE, (auto const __step = __ns.count(); assert(__step < numeric_limits::max()); diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_external.h b/libcudacxx/include/cuda/std/__thread/threading_support_external.h index 639e117355c..92d0945a029 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_external.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_external.h @@ -28,9 +28,9 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield(); +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield(); -_LIBCUDACXX_HIDE_FROM_ABI void 
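// Aside: a hedged sketch of the polling strategy used by the renamed
// __cccl_thread_poll_with_backoff above: spin first, yield the processor for the
// second half of the spin budget, then back off by sleeping for a quarter of the
// elapsed time, clamped to the [10us, 1ms] range. kPollingCount and the function
// name below are illustrative stand-ins, not the library's identifiers.
#include <chrono>
#include <thread>

template <class Predicate>
bool poll_with_backoff_sketch(Predicate pred,
                              std::chrono::nanoseconds max_wait = std::chrono::nanoseconds::zero())
{
  using namespace std::chrono;
  constexpr int kPollingCount = 64;
  auto const start            = high_resolution_clock::now();
  for (int count = 0;;)
  {
    if (pred())
    {
      return true;
    }
    if (count < kPollingCount)
    {
      if (count > (kPollingCount >> 1))
      {
        std::this_thread::yield(); // stand-in for the processor-yield hint
      }
      ++count;
      continue;
    }
    auto const elapsed = high_resolution_clock::now() - start;
    if (max_wait != nanoseconds::zero() && elapsed > max_wait)
    {
      return false; // timed out
    }
    auto const step = elapsed / 4;
    if (step >= milliseconds(1))
    {
      std::this_thread::sleep_for(milliseconds(1));
    }
    else if (step >= microseconds(10))
    {
      std::this_thread::sleep_for(step);
    }
    else
    {
      std::this_thread::yield();
    }
  }
}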
__libcpp_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns); +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns); _LIBCUDACXX_END_NAMESPACE_STD diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h b/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h index 4b1af8c7bc2..3da59117761 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_pthread.h @@ -40,51 +40,51 @@ _CCCL_PUSH_MACROS -typedef ::timespec __libcpp_timespec_t; +typedef ::timespec __cccl_timespec_t; _LIBCUDACXX_BEGIN_NAMESPACE_STD // Mutex -typedef pthread_mutex_t __libcpp_mutex_t; +typedef pthread_mutex_t __cccl_mutex_t; # define _LIBCUDACXX_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -typedef pthread_mutex_t __libcpp_recursive_mutex_t; +typedef pthread_mutex_t __cccl_recursive_mutex_t; // Condition Variable -typedef pthread_cond_t __libcpp_condvar_t; +typedef pthread_cond_t __cccl_condvar_t; # define _LIBCUDACXX_CONDVAR_INITIALIZER PTHREAD_COND_INITIALIZER // Semaphore # if defined(__APPLE__) -typedef dispatch_semaphore_t __libcpp_semaphore_t; +typedef dispatch_semaphore_t __cccl_semaphore_t; # define _LIBCUDACXX_SEMAPHORE_MAX numeric_limits::max() # else // ^^^ __APPLE__ ^^^ / vvv !__APPLE__ vvv -typedef sem_t __libcpp_semaphore_t; +typedef sem_t __cccl_semaphore_t; # define _LIBCUDACXX_SEMAPHORE_MAX SEM_VALUE_MAX # endif // !__APPLE__ // Execute once -typedef pthread_once_t __libcpp_exec_once_flag; +typedef pthread_once_t __cccl_exec_once_flag; # define _LIBCUDACXX_EXEC_ONCE_INITIALIZER PTHREAD_ONCE_INIT // Thread id -typedef pthread_t __libcpp_thread_id; +typedef pthread_t __cccl_thread_id; // Thread # define _LIBCUDACXX_NULL_THREAD 0U -typedef pthread_t __libcpp_thread_t; +typedef pthread_t __cccl_thread_t; // Thread Local Storage -typedef pthread_key_t __libcpp_tls_key; +typedef pthread_key_t __cccl_tls_key; # define _LIBCUDACXX_TLS_DESTRUCTOR_CC -_LIBCUDACXX_HIDE_FROM_ABI __libcpp_timespec_t __libcpp_to_timespec(const _CUDA_VSTD::chrono::nanoseconds& __ns) +_LIBCUDACXX_HIDE_FROM_ABI __cccl_timespec_t __cccl_to_timespec(const _CUDA_VSTD::chrono::nanoseconds& __ns) { using namespace chrono; seconds __s = duration_cast(__ns); - __libcpp_timespec_t __ts; + __cccl_timespec_t __ts; typedef decltype(__ts.tv_sec) ts_sec; constexpr ts_sec __ts_sec_max = numeric_limits::max(); @@ -104,73 +104,73 @@ _LIBCUDACXX_HIDE_FROM_ABI __libcpp_timespec_t __libcpp_to_timespec(const _CUDA_V // Semaphore # if defined(__APPLE__) -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_init(__libcpp_semaphore_t* __sem, int __init) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_init(__cccl_semaphore_t* __sem, int __init) { return (*__sem = dispatch_semaphore_create(__init)) != nullptr; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_destroy(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_destroy(__cccl_semaphore_t* __sem) { dispatch_release(*__sem); return true; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_post(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_post(__cccl_semaphore_t* __sem) { dispatch_semaphore_signal(*__sem); return true; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_wait(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_wait(__cccl_semaphore_t* __sem) { return dispatch_semaphore_wait(*__sem, DISPATCH_TIME_FOREVER) == 0; } _LIBCUDACXX_HIDE_FROM_ABI bool 
-__libcpp_semaphore_wait_timed(__libcpp_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) +__cccl_semaphore_wait_timed(__cccl_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) { return dispatch_semaphore_wait(*__sem, dispatch_time(DISPATCH_TIME_NOW, __ns.count())) == 0; } # else // ^^^ __APPLE__ ^^^ / vvv !__APPLE__ vvv -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_init(__libcpp_semaphore_t* __sem, int __init) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_init(__cccl_semaphore_t* __sem, int __init) { return sem_init(__sem, 0, __init) == 0; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_destroy(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_destroy(__cccl_semaphore_t* __sem) { return sem_destroy(__sem) == 0; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_post(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_post(__cccl_semaphore_t* __sem) { return sem_post(__sem) == 0; } -_LIBCUDACXX_HIDE_FROM_ABI bool __libcpp_semaphore_wait(__libcpp_semaphore_t* __sem) +_LIBCUDACXX_HIDE_FROM_ABI bool __cccl_semaphore_wait(__cccl_semaphore_t* __sem) { return sem_wait(__sem) == 0; } _LIBCUDACXX_HIDE_FROM_ABI bool -__libcpp_semaphore_wait_timed(__libcpp_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) +__cccl_semaphore_wait_timed(__cccl_semaphore_t* __sem, _CUDA_VSTD::chrono::nanoseconds const& __ns) { - __libcpp_timespec_t __ts = __libcpp_to_timespec(__ns); + __cccl_timespec_t __ts = __cccl_to_timespec(__ns); return sem_timedwait(__sem, &__ts) == 0; } # endif // !__APPLE__ -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield() +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield() { sched_yield(); } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(_CUDA_VSTD::chrono::nanoseconds __ns) { - __libcpp_timespec_t __ts = __libcpp_to_timespec(__ns); + __cccl_timespec_t __ts = __cccl_to_timespec(__ns); while (nanosleep(&__ts, &__ts) == -1 && errno == EINTR) ; } diff --git a/libcudacxx/include/cuda/std/__thread/threading_support_win32.h b/libcudacxx/include/cuda/std/__thread/threading_support_win32.h index ff8bd6a35fe..ab59307e5aa 100644 --- a/libcudacxx/include/cuda/std/__thread/threading_support_win32.h +++ b/libcudacxx/include/cuda/std/__thread/threading_support_win32.h @@ -32,47 +32,47 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD // Mutex -typedef void* __libcpp_mutex_t; +typedef void* __cccl_mutex_t; # define _LIBCUDACXX_MUTEX_INITIALIZER 0 # if defined(_M_IX86) || defined(__i386__) || defined(_M_ARM) || defined(__arm__) -typedef void* __libcpp_recursive_mutex_t[6]; +typedef void* __cccl_recursive_mutex_t[6]; # elif defined(_M_AMD64) || defined(__x86_64__) || defined(_M_ARM64) || defined(__aarch64__) -typedef void* __libcpp_recursive_mutex_t[5]; +typedef void* __cccl_recursive_mutex_t[5]; # else # error Unsupported architecture # endif // Condition Variable -typedef void* __libcpp_condvar_t; +typedef void* __cccl_condvar_t; # define _LIBCUDACXX_CONDVAR_INITIALIZER 0 // Semaphore -typedef void* __libcpp_semaphore_t; +typedef void* __cccl_semaphore_t; // Execute Once -typedef void* __libcpp_exec_once_flag; +typedef void* __cccl_exec_once_flag; # define _LIBCUDACXX_EXEC_ONCE_INITIALIZER 0 // Thread ID -typedef long __libcpp_thread_id; +typedef long __cccl_thread_id; // Thread # define _LIBCUDACXX_NULL_THREAD 0U -typedef void* __libcpp_thread_t; +typedef void* __cccl_thread_t; // 
Thread Local Storage -typedef long __libcpp_tls_key; +typedef long __cccl_tls_key; # define _LIBCUDACXX_TLS_DESTRUCTOR_CC __stdcall -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_yield() +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_yield() { SwitchToThread(); } -_LIBCUDACXX_HIDE_FROM_ABI void __libcpp_thread_sleep_for(chrono::nanoseconds __ns) +_LIBCUDACXX_HIDE_FROM_ABI void __cccl_thread_sleep_for(chrono::nanoseconds __ns) { using namespace chrono; // round-up to the nearest milisecond diff --git a/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h b/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h index cc74e6bbbd8..8b70295ce14 100644 --- a/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h +++ b/libcudacxx/include/cuda/std/__type_traits/add_lvalue_reference.h @@ -31,7 +31,7 @@ using add_lvalue_reference_t _CCCL_NODEBUG_ALIAS = _CCCL_BUILTIN_ADD_LVALUE_REFE #else // ^^^ _CCCL_BUILTIN_ADD_LVALUE_REFERENCE ^^^ / vvv !_CCCL_BUILTIN_ADD_LVALUE_REFERENCE vvv -template ::value> +template ::value> struct __add_lvalue_reference_impl { typedef _CCCL_NODEBUG_ALIAS _Tp type; diff --git a/libcudacxx/include/cuda/std/__type_traits/add_pointer.h b/libcudacxx/include/cuda/std/__type_traits/add_pointer.h index bf89c4fd082..65986787c84 100644 --- a/libcudacxx/include/cuda/std/__type_traits/add_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/add_pointer.h @@ -34,7 +34,7 @@ template using add_pointer_t _CCCL_NODEBUG_ALIAS = _CCCL_BUILTIN_ADD_POINTER(_Tp); #else // ^^^ _CCCL_BUILTIN_ADD_POINTER ^^^ / vvv !_CCCL_BUILTIN_ADD_POINTER vvv -template ::value || is_void<_Tp>::value> +template ::value || is_void<_Tp>::value> struct __add_pointer_impl { typedef _CCCL_NODEBUG_ALIAS remove_reference_t<_Tp>* type; diff --git a/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h b/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h index c9704de4092..eb9e3f0acdf 100644 --- a/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h +++ b/libcudacxx/include/cuda/std/__type_traits/add_rvalue_reference.h @@ -31,7 +31,7 @@ using add_rvalue_reference_t _CCCL_NODEBUG_ALIAS = _CCCL_BUILTIN_ADD_RVALUE_REFE #else // ^^^ _CCCL_BUILTIN_ADD_RVALUE_REFERENCE ^^^ / vvv !_CCCL_BUILTIN_ADD_RVALUE_REFERENCE vvv -template ::value> +template ::value> struct __add_rvalue_reference_impl { typedef _CCCL_NODEBUG_ALIAS _Tp type; diff --git a/libcudacxx/include/cuda/std/__type_traits/decay.h b/libcudacxx/include/cuda/std/__type_traits/decay.h index 2888466585c..b8d5a744cfd 100644 --- a/libcudacxx/include/cuda/std/__type_traits/decay.h +++ b/libcudacxx/include/cuda/std/__type_traits/decay.h @@ -66,7 +66,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT decay typedef _CCCL_NODEBUG_ALIAS remove_reference_t<_Tp> _Up; public: - typedef _CCCL_NODEBUG_ALIAS typename __decay_impl<_Up, __libcpp_is_referenceable<_Up>::value>::type type; + typedef _CCCL_NODEBUG_ALIAS typename __decay_impl<_Up, __cccl_is_referenceable<_Up>::value>::type type; }; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h b/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h index fa762d24b2d..983e17e3553 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_bounded_array.h @@ -26,10 +26,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_bounded_array : false_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_bounded_array : false_type {}; 
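The fallback `__add_lvalue_reference_impl` / `__add_pointer_impl` / `__add_rvalue_reference_impl` hunks above only form `_Tp&` or `_Tp*` when the argument is referenceable, so `void` and cv- or ref-qualified function types fall back to the identity. A minimal stand-alone sketch of that guard, with illustrative names rather than the library's:

#include <type_traits>

// Detect "referenceable" types: T& is well-formed exactly for object types,
// references, and unqualified function types.
template <class T, class = void>
struct is_referenceable_sketch : std::false_type {};

template <class T>
struct is_referenceable_sketch<T, std::void_t<T&>> : std::true_type {};

// add_lvalue_reference falls back to the identity when T is not referenceable.
template <class T, bool = is_referenceable_sketch<T>::value>
struct add_lvalue_reference_sketch { using type = T; };

template <class T>
struct add_lvalue_reference_sketch<T, true> { using type = T&; };

static_assert(std::is_same_v<add_lvalue_reference_sketch<int>::type, int&>, "");
static_assert(std::is_same_v<add_lvalue_reference_sketch<void>::type, void>, "");
static_assert(std::is_same_v<add_lvalue_reference_sketch<void() const>::type, void() const>, "");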
template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_bounded_array<_Tp[_Np]> : true_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_bounded_array<_Tp[_Np]> : true_type {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h index b4281c6c637..fc24b17077a 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_constant_evaluated.h @@ -27,21 +27,16 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_constant_evaluated() noexcept { return _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(); } - -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_is_constant_evaluated() noexcept -{ - return _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(); -} -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_default_is_constant_evaluated() noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __cccl_default_is_constant_evaluated() noexcept { return _CCCL_BUILTIN_IS_CONSTANT_EVALUATED(); } #else // ^^^ _CCCL_BUILTIN_IS_CONSTANT_EVALUATED ^^^ / vvv !_CCCL_BUILTIN_IS_CONSTANT_EVALUATED vvv -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_is_constant_evaluated() noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool is_constant_evaluated() noexcept { return false; } -_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __libcpp_default_is_constant_evaluated() noexcept +_LIBCUDACXX_HIDE_FROM_ABI constexpr bool __cccl_default_is_constant_evaluated() noexcept { return true; } diff --git a/libcudacxx/include/cuda/std/__type_traits/is_constructible.h b/libcudacxx/include/cuda/std/__type_traits/is_constructible.h index cd82aa9397c..579c45c0295 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_constructible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_constructible.h @@ -45,7 +45,7 @@ struct __nat || defined(_LIBCUDACXX_USE_IS_CONSTRUCTIBLE_FALLBACK)) template -struct __libcpp_is_constructible; +struct __cccl_is_constructible; template struct __is_invalid_base_to_derived_cast @@ -54,8 +54,7 @@ struct __is_invalid_base_to_derived_cast using _RawFrom = remove_cvref_t<_From>; using _RawTo = remove_cvref_t<_To>; static const bool value = - _And<_IsNotSame<_RawFrom, _RawTo>, is_base_of<_RawFrom, _RawTo>, _Not<__libcpp_is_constructible<_RawTo, _From>>>:: - value; + _And<_IsNotSame<_RawFrom, _RawTo>, is_base_of<_RawFrom, _RawTo>, _Not<__cccl_is_constructible<_RawTo, _From>>>::value; }; template @@ -123,26 +122,26 @@ struct __is_default_constructible<_Tp[_Nx], false> : __is_default_constructible< {}; template -struct __libcpp_is_constructible +struct __cccl_is_constructible { static_assert(sizeof...(_Args) > 1, "Wrong specialization"); typedef decltype(__is_constructible_helper::__test_nary<_Tp, _Args...>(0)) type; }; template -struct __libcpp_is_constructible<_Tp> : __is_default_constructible<_Tp> +struct __cccl_is_constructible<_Tp> : __is_default_constructible<_Tp> {}; template -struct __libcpp_is_constructible<_Tp, _A0> : public decltype(__is_constructible_helper::__test_unary<_Tp, _A0>(0)) +struct __cccl_is_constructible<_Tp, _A0> : public decltype(__is_constructible_helper::__test_unary<_Tp, _A0>(0)) {}; template -struct __libcpp_is_constructible<_Tp&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&, _A0>(0)) +struct __cccl_is_constructible<_Tp&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&, _A0>(0)) {}; template -struct __libcpp_is_constructible<_Tp&&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&&, 
_A0>(0)) +struct __cccl_is_constructible<_Tp&&, _A0> : public decltype(__is_constructible_helper::__test_cast<_Tp&&, _A0>(0)) {}; #endif @@ -160,7 +159,7 @@ _CCCL_INLINE_VAR constexpr bool is_constructible_v = _CCCL_BUILTIN_IS_CONSTRUCTI #else template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_constructible : public __libcpp_is_constructible<_Tp, _Args...>::type +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_constructible : public __cccl_is_constructible<_Tp, _Args...>::type {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_empty.h b/libcudacxx/include/cuda/std/__type_traits/is_empty.h index 4b11bc7da88..dc2a3691321 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_empty.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_empty.h @@ -50,15 +50,15 @@ struct __is_empty2 }; template -struct __libcpp_empty : public integral_constant) == sizeof(__is_empty2)> +struct __cccl_empty : public integral_constant) == sizeof(__is_empty2)> {}; template -struct __libcpp_empty<_Tp, false> : public false_type +struct __cccl_empty<_Tp, false> : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_empty : public __libcpp_empty<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_empty : public __cccl_empty<_Tp> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h b/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h index 59336c6acad..913bacdb2a6 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_floating_point.h @@ -26,20 +26,20 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_floating_point : public false_type +struct __cccl_is_floating_point : public false_type {}; template <> -struct __libcpp_is_floating_point : public true_type +struct __cccl_is_floating_point : public true_type {}; template <> -struct __libcpp_is_floating_point : public true_type +struct __cccl_is_floating_point : public true_type {}; template <> -struct __libcpp_is_floating_point : public true_type +struct __cccl_is_floating_point : public true_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_floating_point : public __libcpp_is_floating_point> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_floating_point : public __cccl_is_floating_point> {}; #if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_integral.h b/libcudacxx/include/cuda/std/__type_traits/is_integral.h index d3b412b8135..eddcba144c5 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_integral.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_integral.h @@ -39,72 +39,72 @@ _CCCL_INLINE_VAR constexpr bool is_integral_v = _CCCL_BUILTIN_IS_INTEGRAL(_Tp); #else // ^^^ _CCCL_BUILTIN_IS_INTEGRAL ^^^ / vvv !_CCCL_BUILTIN_IS_INTEGRAL vvv template -struct __libcpp_is_integral : public false_type +struct __cccl_is_integral : public false_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # ifndef 
_LIBCUDACXX_NO_HAS_CHAR8_T template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # endif # ifndef _LIBCUDACXX_HAS_NO_UNICODE_CHARS template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; template <> -struct __libcpp_is_integral : public true_type +struct __cccl_is_integral : public true_type {}; # ifndef _LIBCUDACXX_HAS_NO_INT128 template <> -struct __libcpp_is_integral<__int128_t> : public true_type +struct __cccl_is_integral<__int128_t> : public true_type {}; template <> -struct __libcpp_is_integral<__uint128_t> : public true_type +struct __cccl_is_integral<__uint128_t> : public true_type {}; # endif template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_integral - : public integral_constant>::value> + : public integral_constant>::value> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h b/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h index 943ed414a5c..fff6f96ee81 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_member_function_pointer.h @@ -28,7 +28,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_member_pointer +struct __cccl_is_member_pointer { enum { @@ -38,7 +38,7 @@ struct __libcpp_is_member_pointer }; }; template -struct __libcpp_is_member_pointer<_Tp _Up::*> +struct __cccl_is_member_pointer<_Tp _Up::*> { enum { @@ -64,7 +64,7 @@ _CCCL_INLINE_VAR constexpr bool is_member_function_pointer_v = _CCCL_BUILTIN_IS_ template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_member_function_pointer - : public integral_constant>::__is_func> + : public integral_constant>::__is_func> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h b/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h index b9f411cf9d7..86ce9dd9d26 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_member_object_pointer.h @@ -42,7 +42,7 @@ _CCCL_INLINE_VAR constexpr bool is_member_object_pointer_v = _CCCL_BUILTIN_IS_ME template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_member_object_pointer - : public integral_constant>::__is_obj> + : public integral_constant>::__is_obj> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h 
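// Aside: a hedged sketch of the decomposition behind the renamed
// __cccl_is_member_pointer helper above: a single partial specialization on
// T U::* classifies member pointers, and is_function on the pointee separates
// member functions (used by is_member_function_pointer) from member objects
// (used by is_member_object_pointer). Names below are illustrative.
#include <type_traits>

template <class T>
struct member_pointer_sketch
{
  enum { is_member = false, is_func = false, is_obj = false };
};

template <class T, class U>
struct member_pointer_sketch<T U::*>
{
  enum
  {
    is_member = true,
    is_func   = std::is_function<T>::value,
    is_obj    = !is_func
  };
};

struct Widget
{
  int  field;
  void method();
};

static_assert(member_pointer_sketch<decltype(&Widget::field)>::is_obj, "");
static_assert(member_pointer_sketch<decltype(&Widget::method)>::is_func, "");
static_assert(!member_pointer_sketch<int*>::is_member, "");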
b/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h index 2f0ff0d5eb6..74ceaf6e7d3 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_member_pointer.h @@ -42,7 +42,7 @@ _CCCL_INLINE_VAR constexpr bool is_member_pointer_v = _CCCL_BUILTIN_IS_MEMBER_PO template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_member_pointer - : public integral_constant>::__is_member> + : public integral_constant>::__is_member> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h index 3232e3eff2c..b12662cb9cc 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_assignable.h @@ -42,20 +42,20 @@ _CCCL_INLINE_VAR constexpr bool is_nothrow_assignable_v = _CCCL_BUILTIN_IS_NOTHR #elif !defined(_LIBCUDACXX_HAS_NO_NOEXCEPT) && !defined(_LIBCUDACXX_HAS_NO_NOEXCEPT_SFINAE) template -struct __libcpp_is_nothrow_assignable; +struct __cccl_is_nothrow_assignable; template -struct __libcpp_is_nothrow_assignable : public false_type +struct __cccl_is_nothrow_assignable : public false_type {}; template -struct __libcpp_is_nothrow_assignable +struct __cccl_is_nothrow_assignable : public integral_constant() = _CUDA_VSTD::declval<_Arg>())> {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_assignable - : public __libcpp_is_nothrow_assignable::value, _Tp, _Arg> + : public __cccl_is_nothrow_assignable::value, _Tp, _Arg> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h index b225e46cbc0..62440f9b26e 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_constructible.h @@ -44,10 +44,10 @@ _CCCL_INLINE_VAR constexpr bool is_nothrow_constructible_v = _CCCL_BUILTIN_IS_NO # if !defined(_LIBCUDACXX_HAS_NO_NOEXCEPT) template -struct __libcpp_is_nothrow_constructible; +struct __cccl_is_nothrow_constructible; template -struct __libcpp_is_nothrow_constructible +struct __cccl_is_nothrow_constructible : public integral_constant()...))> {}; @@ -56,22 +56,22 @@ _LIBCUDACXX_HIDE_FROM_ABI void __implicit_conversion_to(_Tp) noexcept {} template -struct __libcpp_is_nothrow_constructible +struct __cccl_is_nothrow_constructible : public integral_constant(_CUDA_VSTD::declval<_Arg>()))> {}; template -struct __libcpp_is_nothrow_constructible : public false_type +struct __cccl_is_nothrow_constructible : public false_type {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_constructible - : __libcpp_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp, _Args...> + : __cccl_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp, _Args...> {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_constructible<_Tp[_Ns]> - : __libcpp_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp> + : __cccl_is_nothrow_constructible::value, is_reference<_Tp>::value, _Tp> {}; # else diff --git a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h index 1cd366424de..23821feaffd 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h +++ 
b/libcudacxx/include/cuda/std/__type_traits/is_nothrow_destructible.h @@ -41,16 +41,16 @@ struct is_nothrow_destructible : public integral_constant::value> -struct __libcpp_is_nothrow_destructible : false_type +struct __cccl_is_nothrow_destructible : false_type {}; template -struct __libcpp_is_nothrow_destructible<_Tp, true> +struct __cccl_is_nothrow_destructible<_Tp, true> : public integral_constant().~_Tp())> {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible : public __libcpp_is_nothrow_destructible<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible : public __cccl_is_nothrow_destructible<_Tp> {}; template @@ -68,12 +68,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible<_Tp&&> : public tru #else template -struct __libcpp_nothrow_destructor : public integral_constant::value || is_reference<_Tp>::value> +struct __cccl_nothrow_destructor : public integral_constant::value || is_reference<_Tp>::value> {}; template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_destructible - : public __libcpp_nothrow_destructor> + : public __cccl_nothrow_destructor> {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_pointer.h b/libcudacxx/include/cuda/std/__type_traits/is_pointer.h index b87e5537ca1..67969fbbb56 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_pointer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_pointer.h @@ -39,14 +39,14 @@ _CCCL_INLINE_VAR constexpr bool is_pointer_v = _CCCL_BUILTIN_IS_POINTER(_Tp); #else template -struct __libcpp_is_pointer : public false_type +struct __cccl_is_pointer : public false_type {}; template -struct __libcpp_is_pointer<_Tp*> : public true_type +struct __cccl_is_pointer<_Tp*> : public true_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_pointer : public __libcpp_is_pointer> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_pointer : public __cccl_is_pointer> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h b/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h index 63d2d71fb52..c2ddc771904 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_referenceable.h @@ -28,11 +28,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD #if defined(_CCCL_BUILTIN_IS_REFERENCEABLE) && !defined(_LIBCUDACXX_USE_IS_REFERENCEABLE_FALLBACK) template -struct __libcpp_is_referenceable : public integral_constant +struct __cccl_is_referenceable : public integral_constant {}; #else -struct __libcpp_is_referenceable_impl +struct __cccl_is_referenceable_impl { template _CCCL_HOST_DEVICE static _Tp& __test(int); @@ -41,8 +41,8 @@ struct __libcpp_is_referenceable_impl }; template -struct __libcpp_is_referenceable - : integral_constant(0)), false_type>::value> +struct __cccl_is_referenceable + : integral_constant(0)), false_type>::value> {}; #endif // defined(_CCCL_BUILTIN_IS_REFERENCEABLE) && !defined(_LIBCUDACXX_USE_IS_REFERENCEABLE_FALLBACK) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_signed.h b/libcudacxx/include/cuda/std/__type_traits/is_signed.h index 33e5c1eb5c5..220790002ba 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_signed.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_signed.h @@ -42,23 +42,23 @@ _CCCL_INLINE_VAR constexpr bool is_signed_v = _CCCL_BUILTIN_IS_SIGNED(_Tp); #else template ::value> -struct __libcpp_is_signed_impl : public bool_constant<(_Tp(-1) < _Tp(0))> +struct __cccl_is_signed_impl : public 
bool_constant<(_Tp(-1) < _Tp(0))> {}; template -struct __libcpp_is_signed_impl<_Tp, false> : public true_type +struct __cccl_is_signed_impl<_Tp, false> : public true_type {}; // floating point template ::value> -struct __libcpp_is_signed : public __libcpp_is_signed_impl<_Tp> +struct __cccl_is_signed : public __cccl_is_signed_impl<_Tp> {}; template -struct __libcpp_is_signed<_Tp, false> : public false_type +struct __cccl_is_signed<_Tp, false> : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_signed : public __libcpp_is_signed<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_signed : public __cccl_is_signed<_Tp> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h index 69ce3aa8a6e..273df0d830b 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_signed_integer.h @@ -25,26 +25,26 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_signed_integer : public false_type +struct __cccl_is_signed_integer : public false_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; template <> -struct __libcpp_is_signed_integer : public true_type +struct __cccl_is_signed_integer : public true_type {}; #ifndef _LIBCUDACXX_HAS_NO_INT128 template <> -struct __libcpp_is_signed_integer<__int128_t> : public true_type +struct __cccl_is_signed_integer<__int128_t> : public true_type {}; #endif diff --git a/libcudacxx/include/cuda/std/__type_traits/is_swappable.h b/libcudacxx/include/cuda/std/__type_traits/is_swappable.h index 964f14d6381..d2727ced8b4 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_swappable.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_swappable.h @@ -163,7 +163,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT is_swappable_with template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_swappable - : public conditional_t<__libcpp_is_referenceable<_Tp>::value, + : public conditional_t<__cccl_is_referenceable<_Tp>::value, is_swappable_with, add_lvalue_reference_t<_Tp>>, false_type> {}; @@ -175,7 +175,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_swappable_with template struct _CCCL_TYPE_VISIBILITY_DEFAULT is_nothrow_swappable - : public conditional_t<__libcpp_is_referenceable<_Tp>::value, + : public conditional_t<__cccl_is_referenceable<_Tp>::value, is_nothrow_swappable_with, add_lvalue_reference_t<_Tp>>, false_type> {}; diff --git a/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h b/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h index 9116ced5e8f..57a4af4829c 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_trivially_destructible.h @@ -47,12 +47,12 @@ _CCCL_SUPPRESS_DEPRECATED_POP #else template -struct __libcpp_trivial_destructor : public integral_constant::value || is_reference<_Tp>::value> +struct __cccl_trivial_destructor : public integral_constant::value || is_reference<_Tp>::value> {}; template 
struct _CCCL_TYPE_VISIBILITY_DEFAULT is_trivially_destructible - : public __libcpp_trivial_destructor> + : public __cccl_trivial_destructor> {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h b/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h index 501efded75a..2e09d4c8726 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unbounded_array.h @@ -25,10 +25,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_unbounded_array : false_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_unbounded_array : false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __libcpp_is_unbounded_array<_Tp[]> : true_type +struct _CCCL_TYPE_VISIBILITY_DEFAULT __cccl_is_unbounded_array<_Tp[]> : true_type {}; template diff --git a/libcudacxx/include/cuda/std/__type_traits/is_union.h b/libcudacxx/include/cuda/std/__type_traits/is_union.h index 37ee313d8a3..9978f99e6be 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_union.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_union.h @@ -39,10 +39,10 @@ _CCCL_INLINE_VAR constexpr bool is_union_v = _CCCL_BUILTIN_IS_UNION(_Tp); #else template -struct __libcpp_union : public false_type +struct __cccl_union : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_union : public __libcpp_union> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_union : public __cccl_union> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h index abd951c7202..4a5ad7d92e2 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned.h @@ -45,23 +45,23 @@ _CCCL_INLINE_VAR constexpr bool is_unsigned_v = _CCCL_BUILTIN_IS_UNSIGNED(_Tp); #else template ::value> -struct __libcpp_is_unsigned_impl : public bool_constant<(_Tp(0) < _Tp(-1))> +struct __cccl_is_unsigned_impl : public bool_constant<(_Tp(0) < _Tp(-1))> {}; template -struct __libcpp_is_unsigned_impl<_Tp, false> : public false_type +struct __cccl_is_unsigned_impl<_Tp, false> : public false_type {}; // floating point template ::value> -struct __libcpp_is_unsigned : public __libcpp_is_unsigned_impl<_Tp> +struct __cccl_is_unsigned : public __cccl_is_unsigned_impl<_Tp> {}; template -struct __libcpp_is_unsigned<_Tp, false> : public false_type +struct __cccl_is_unsigned<_Tp, false> : public false_type {}; template -struct _CCCL_TYPE_VISIBILITY_DEFAULT is_unsigned : public __libcpp_is_unsigned<_Tp> +struct _CCCL_TYPE_VISIBILITY_DEFAULT is_unsigned : public __cccl_is_unsigned<_Tp> {}; # if !defined(_CCCL_NO_VARIABLE_TEMPLATES) diff --git a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h index 888020032ed..088c98af66a 100644 --- a/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h +++ b/libcudacxx/include/cuda/std/__type_traits/is_unsigned_integer.h @@ -25,26 +25,26 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD template -struct __libcpp_is_unsigned_integer : public false_type +struct __cccl_is_unsigned_integer : public false_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; 
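The fallback `__cccl_is_signed` / `__cccl_is_unsigned` traits renamed above reduce signedness detection to a single comparison: converting `-1` to an unsigned type wraps to its maximum value. A compact stand-alone sketch of that arithmetic trick (it collapses the library's two-level arithmetic/integral dispatch into one layer, and the names are illustrative):

#include <type_traits>

// For arithmetic T, T(-1) < T(0) holds exactly when T can represent negative
// values; non-arithmetic types are neither signed nor unsigned.
template <class T, bool = std::is_arithmetic<T>::value>
struct is_signed_sketch : std::bool_constant<(T(-1) < T(0))> {};

template <class T>
struct is_signed_sketch<T, false> : std::false_type {};

template <class T, bool = std::is_arithmetic<T>::value>
struct is_unsigned_sketch : std::bool_constant<(T(0) < T(-1))> {};

template <class T>
struct is_unsigned_sketch<T, false> : std::false_type {};

static_assert(is_signed_sketch<int>::value && !is_signed_sketch<unsigned>::value, "");
static_assert(is_unsigned_sketch<unsigned char>::value && !is_unsigned_sketch<float>::value, "");
static_assert(!is_signed_sketch<void*>::value && !is_unsigned_sketch<void*>::value, "");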
template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; template <> -struct __libcpp_is_unsigned_integer : public true_type +struct __cccl_is_unsigned_integer : public true_type {}; #ifndef _LIBCUDACXX_HAS_NO_INT128 template <> -struct __libcpp_is_unsigned_integer<__uint128_t> : public true_type +struct __cccl_is_unsigned_integer<__uint128_t> : public true_type {}; #endif diff --git a/libcudacxx/include/cuda/std/__utility/unreachable.h b/libcudacxx/include/cuda/std/__utility/unreachable.h index e0d704c9b6d..a3b23397e0a 100644 --- a/libcudacxx/include/cuda/std/__utility/unreachable.h +++ b/libcudacxx/include/cuda/std/__utility/unreachable.h @@ -22,7 +22,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __libcpp_unreachable() +_CCCL_NORETURN _LIBCUDACXX_HIDE_FROM_ABI void __cccl_unreachable() { _CCCL_UNREACHABLE(); } diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__string b/libcudacxx/include/cuda/std/detail/libcxx/include/__string index 93cba133797..20857deef7c 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__string +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__string @@ -262,7 +262,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT char_traits { #if _CCCL_COMPILER(GCC, <, 13) // absurd workaround for GCC "internal compiler error: in cxx_eval_array_reference" - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) ; #endif #if defined(_CCCL_BUILTIN_STRLEN) @@ -470,7 +470,7 @@ _LIBCUDACXX_HIDE_FROM_ABI constexpr size_t __char_traits_length_checked(const ty #if _LIBCUDACXX_DEBUG_LEVEL >= 1 return __s ? 
_Traits::length(__s) - : (_CUDA_VSTD::__libcpp_debug_function(_CUDA_VSTD::__libcpp_debug_info( + : (_CUDA_VSTD::__cccl_debug_function(_CUDA_VSTD::__cccl_debug_info( __FILE__, __LINE__, "p == nullptr", "null pointer pass to non-null argument of char_traits<...>::length")), 0); #else diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm b/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm index 01e92219370..ec32a3e3f77 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/algorithm @@ -1070,7 +1070,7 @@ typename uniform_int_distribution<_IntType>::result_type uniform_int_distributio { return static_cast(_Eng(__g, _Dt)()); } - size_t __w = _Dt - __libcpp_clz(_Rp) - 1; + size_t __w = _Dt - __cccl_clz(_Rp) - 1; if ((_Rp & (std::numeric_limits<_UIntType>::max() >> (_Dt - __w))) != 0) { ++__w; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index 4e98f7c9774..7eecbcc4a20 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -509,7 +509,7 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) #if _CCCL_STD_VER > 2011 && defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) // Avoid floating point operations that are invalid during constant evaluation - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); @@ -652,7 +652,7 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) #if _CCCL_STD_VER > 2011 && defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) // Avoid floating point operations that are invalid during constant evaluation - if (__libcpp_is_constant_evaluated()) + if (_CUDA_VSTD::is_constant_evaluated()) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); @@ -841,12 +841,12 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp real(const complex<_Tp>& __c // 26.3.7 values: template -struct __libcpp_complex_overload_traits +struct __cccl_complex_overload_traits {}; // Integral Types template -struct __libcpp_complex_overload_traits<_Tp, true, false> +struct __cccl_complex_overload_traits<_Tp, true, false> { using _ValueType = double; using _ComplexType = complex; @@ -854,20 +854,20 @@ struct __libcpp_complex_overload_traits<_Tp, true, false> // Floating point types template -struct __libcpp_complex_overload_traits<_Tp, false, true> +struct __cccl_complex_overload_traits<_Tp, false, true> { using _ValueType = _Tp; using _ComplexType = complex<_Tp>; }; template -using __libcpp_complex_value_type = typename __libcpp_complex_overload_traits<_Tp>::_ValueType; +using __cccl_complex_value_type = typename __cccl_complex_overload_traits<_Tp>::_ValueType; template -using __libcpp_complex_complex_type = typename __libcpp_complex_overload_traits<_Tp>::_ComplexType; +using __cccl_complex_complex_type = typename __cccl_complex_overload_traits<_Tp>::_ComplexType; template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_value_type<_Tp> real(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_value_type<_Tp> real(_Tp __re) { return __re; } @@ -881,7 +881,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp imag(const complex<_Tp>& __c } template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_value_type<_Tp> 
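// Aside: a hedged sketch (illustrative names, not the library's) of what the
// renamed __cccl_complex_overload_traits above selects: the scalar overloads of
// real/imag/norm/conj/proj promote integral arguments to double, keep
// floating-point argument types as-is, and drop out of overload resolution for
// anything else.
#include <complex>
#include <type_traits>

template <class T,
          bool = std::is_integral<T>::value,
          bool = std::is_floating_point<T>::value>
struct complex_overload_traits_sketch
{}; // no members: SFINAEs the scalar overloads away

template <class T>
struct complex_overload_traits_sketch<T, true, false> // integral -> double
{
  using value_type   = double;
  using complex_type = std::complex<double>;
};

template <class T>
struct complex_overload_traits_sketch<T, false, true> // floating point -> T
{
  using value_type   = T;
  using complex_type = std::complex<T>;
};

template <class T>
typename complex_overload_traits_sketch<T>::complex_type conj_sketch(T re)
{
  return typename complex_overload_traits_sketch<T>::complex_type(re);
}

static_assert(std::is_same<decltype(conj_sketch(1)), std::complex<double>>::value, "");
static_assert(std::is_same<decltype(conj_sketch(1.0f)), std::complex<float>>::value, "");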
imag(_Tp) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_value_type<_Tp> imag(_Tp) { return 0; } @@ -940,9 +940,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp norm(const complex<_Tp>& __c } template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_value_type<_Tp> norm(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_value_type<_Tp> norm(_Tp __re) { - return static_cast<__libcpp_complex_value_type<_Tp>>(__re) * __re; + return static_cast<__cccl_complex_value_type<_Tp>>(__re) * __re; } // conj @@ -954,9 +954,9 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 complex<_Tp> conj(const complex< } template -_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __libcpp_complex_complex_type<_Tp> conj(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 __cccl_complex_complex_type<_Tp> conj(_Tp __re) { - return __libcpp_complex_complex_type<_Tp>(__re); + return __cccl_complex_complex_type<_Tp>(__re); } // proj @@ -973,7 +973,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> proj(const complex<_Tp>& __c) } template -_LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __libcpp_complex_complex_type<_Tp>> proj(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __cccl_complex_complex_type<_Tp>> proj(_Tp __re) { if (_CUDA_VSTD::__constexpr_isinf(__re)) { @@ -983,9 +983,9 @@ _LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __libcpp_c } template -_LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __libcpp_complex_complex_type<_Tp>> proj(_Tp __re) +_LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __cccl_complex_complex_type<_Tp>> proj(_Tp __re) { - return __libcpp_complex_complex_type<_Tp>(__re); + return __cccl_complex_complex_type<_Tp>(__re); } // polar diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/limits b/libcudacxx/include/cuda/std/detail/libcxx/include/limits index ea830da6046..82f6a00c804 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/limits +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/limits @@ -141,7 +141,7 @@ enum float_denorm_style }; template ::value> -class __libcpp_numeric_limits +class __cccl_numeric_limits { protected: typedef _Tp type; @@ -215,20 +215,20 @@ protected: _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_MSVC(4309) template -struct __libcpp_compute_min +struct __cccl_compute_min { static constexpr _Tp value = static_cast<_Tp>(_Tp(1) << __digits); }; _CCCL_DIAG_POP template -struct __libcpp_compute_min<_Tp, __digits, false> +struct __cccl_compute_min<_Tp, __digits, false> { static constexpr _Tp value = _Tp(0); }; template -class __libcpp_numeric_limits<_Tp, true> +class __cccl_numeric_limits<_Tp, true> { protected: typedef _Tp type; @@ -239,7 +239,7 @@ protected: static constexpr int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); static constexpr int digits10 = digits * 3 / 10; static constexpr int max_digits10 = 0; - static constexpr type __min = __libcpp_compute_min::value; + static constexpr type __min = __cccl_compute_min::value; static constexpr type __max = is_signed ? 
type(type(~0) ^ __min) : type(~0);
   _LIBCUDACXX_HIDE_FROM_ABI static constexpr type min() noexcept
   {
@@ -307,7 +307,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<bool, true>
+class __cccl_numeric_limits<bool, true>
 {
 protected:
   typedef bool type;
@@ -382,7 +382,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<float, true>
+class __cccl_numeric_limits<float, true>
 {
 protected:
   typedef float type;
@@ -470,7 +470,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<double, true>
+class __cccl_numeric_limits<double, true>
 {
 protected:
   typedef double type;
@@ -558,7 +558,7 @@ protected:
 };
 
 template <>
-class __libcpp_numeric_limits<long double, true>
+class __cccl_numeric_limits<long double, true>
 {
 #ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE
@@ -634,9 +634,9 @@ protected:
 };
 
 template <class _Tp>
-class _CCCL_TYPE_VISIBILITY_DEFAULT numeric_limits : private __libcpp_numeric_limits<remove_cv_t<_Tp>>
+class _CCCL_TYPE_VISIBILITY_DEFAULT numeric_limits : private __cccl_numeric_limits<remove_cv_t<_Tp>>
 {
-  typedef __libcpp_numeric_limits<remove_cv_t<_Tp>> __base;
+  typedef __cccl_numeric_limits<remove_cv_t<_Tp>> __base;
   typedef typename __base::type type;
 
 public:
diff --git a/libcudacxx/test/NOTES.TXT b/libcudacxx/test/NOTES.TXT
index 602de495103..ae5c1575281 100644
--- a/libcudacxx/test/NOTES.TXT
+++ b/libcudacxx/test/NOTES.TXT
@@ -12,7 +12,7 @@ These notes contain a list of things that must be done after branching for an LLVM release.
 1. Update _LIBCUDACXX_VERSION in `__config`
-2. Update the __libcpp_version file.
+2. Update the __cccl_version file.
 3. Update the version number in `docs/conf.py`
 4. Create ABI lists for the previous release under `lib/abi`
diff --git a/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp
index 56aa5c72b3d..cacffd6bc5b 100644
--- a/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp
+++ b/libcudacxx/test/libcudacxx/libcxx/type_traits/is_constant_evaluated.pass.cpp
@@ -9,7 +9,7 @@
 //
-// __libcpp_is_constant_evaluated()
+// _CUDA_VSTD::is_constant_evaluated()
 // returns false when there's no constant evaluation support from the compiler.
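// Aside: a minimal stand-alone sketch of how the integer specialization above
// derives its extremes: __cccl_compute_min shifts 1 into the sign bit for signed
// types, and the maximum clears that bit from an all-ones pattern. The names
// digits_sketch/min_sketch/max_sketch are illustrative, not library API; the
// narrowing of the shifted value is well-defined under C++20 two's complement.
#include <climits>
#include <type_traits>

// Value bits; the sign bit is excluded for signed types.
template <class T>
constexpr int digits_sketch = static_cast<int>(sizeof(T) * CHAR_BIT) - std::is_signed_v<T>;

// Minimum: 1 shifted into the sign bit for signed integers, 0 for unsigned ones.
template <class T>
constexpr T min_sketch = std::is_signed_v<T> ? static_cast<T>(T(1) << digits_sketch<T>) : T(0);

// Maximum: all bits set, with the sign bit cleared for signed types.
template <class T>
constexpr T max_sketch = std::is_signed_v<T> ? static_cast<T>(T(~T(0)) ^ min_sketch<T>) : T(~T(0));

static_assert(min_sketch<signed char> == -128 && max_sketch<signed char> == 127, "");
static_assert(min_sketch<unsigned short> == 0 && max_sketch<unsigned short> == 0xFFFF, "");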
// as well as when called not in a constexpr context @@ -21,14 +21,14 @@ int main(int, char**) { - ASSERT_SAME_TYPE(decltype(cuda::std::__libcpp_is_constant_evaluated()), bool); - ASSERT_NOEXCEPT(cuda::std::__libcpp_is_constant_evaluated()); + ASSERT_SAME_TYPE(decltype(cuda::std::is_constant_evaluated()), bool); + ASSERT_NOEXCEPT(cuda::std::is_constant_evaluated()); #if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) - static_assert(cuda::std::__libcpp_is_constant_evaluated(), ""); + static_assert(cuda::std::is_constant_evaluated(), ""); #endif - bool p = cuda::std::__libcpp_is_constant_evaluated(); + bool p = cuda::std::is_constant_evaluated(); assert(!p); return 0; diff --git a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp index 8ea4ad3f34f..d1a83917feb 100644 --- a/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp +++ b/libcudacxx/test/libcudacxx/libcxx/utilities/meta/is_referenceable.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // -// __libcpp_is_referenceable +// __cccl_is_referenceable // // [defns.referenceable] defines "a referenceable type" as: // An object type, a function type that does not have cv-qualifiers @@ -22,141 +22,141 @@ struct Foo {}; -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); #ifndef _LIBCUDACXX_HAS_NO_VECTOR_EXTENSION -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); 
+static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); #endif // Functions without cv-qualifiers are referenceable -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); 
-static_assert((!cuda::std::__libcpp_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); +static_assert((!cuda::std::__cccl_is_referenceable::value), ""); // member functions with or without cv-qualifiers are referenceable -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); 
-static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); - -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); -static_assert((cuda::std::__libcpp_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); 
+static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); + +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); +static_assert((cuda::std::__cccl_is_referenceable::value), ""); int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp index d3217b85baf..c02f29f0178 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy.pass.cpp @@ -78,7 +78,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() assert(j == 75); test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, int*>(); test>(); @@ -86,7 +86,7 @@ __host__ __device__ 
TEST_CONSTEXPR_CXX14 bool test() } test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, MoveOnly*>(); test>(); diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp index 45bced305b0..2c0c210fb08 100644 --- a/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.sorting/alg.sort/partial.sort.copy/partial_sort_copy_comp.pass.cpp @@ -84,7 +84,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() assert(j == 75); test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, int*>(); test>(); @@ -92,7 +92,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX14 bool test() } test, random_access_iterator>(); - if (!cuda::std::__libcpp_is_constant_evaluated()) // This breaks some compilers due to excessive constant folding + if (!cuda::std::is_constant_evaluated()) // This breaks some compilers due to excessive constant folding { test, MoveOnly*>(); test>(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp index c240c9759a9..64388533bbb 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/access.pass.cpp @@ -76,7 +76,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp index 38fd504a48d..a2d12513c3f 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assign.pass.cpp @@ -234,7 +234,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp index d905877373f..347ed9d777e 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/assignment.pass.cpp @@ -224,7 +224,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp index 
0b8280f9a43..f7b8a68031a 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/capacity.pass.cpp @@ -66,7 +66,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp index 5f92e85755a..1a6f587c083 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/comparison.pass.cpp @@ -72,7 +72,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp index 0b3e86bfc35..e30acffe9c1 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/constructor.pass.cpp @@ -283,7 +283,7 @@ __host__ __device__ constexpr void test() { test_default(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test_copy_move(); test_size(); @@ -308,7 +308,7 @@ __host__ __device__ constexpr bool test() test(); // Due to reinterpret_cast within the destructor a on trivially destructible type cannot be constexpr at all - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); } diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp index 3178c1b5b25..a581113b0d0 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/emplace.pass.cpp @@ -196,7 +196,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp index cb9f5830d8b..9b80f98a67a 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/insert.pass.cpp @@ -213,7 +213,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp index 5b3590cc789..06029272c43 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/iterators.pass.cpp @@ -85,7 +85,7 @@ __host__ __device__ constexpr 
bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp index 3500f591b2d..168cebbd6c2 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/resize.pass.cpp @@ -231,7 +231,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp index 775d9ecb3d5..552246b888b 100644 --- a/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/containers/sequences/inplace_vector/swap.pass.cpp @@ -62,7 +62,7 @@ __host__ __device__ constexpr bool test() test(); test(); - if (!cuda::std::__libcpp_is_constant_evaluated()) + if (!cuda::std::is_constant_evaluated()) { test(); test(); diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp index 46e2a3fdd88..4d57f632361 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/memory/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer_deleter.pass.cpp @@ -67,7 +67,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX23 void test_sfinae() static_assert(cuda::std::is_constructible::value, ""); static_assert(cuda::std::is_constructible::value, ""); static_assert(cuda::std::is_constructible::value, ""); - // FIXME: __libcpp_compressed_pair attempts to perform a move even though + // FIXME: __cccl_compressed_pair attempts to perform a move even though // it should only copy. // D d; // U u(nullptr, cuda::std::move(d)); @@ -149,7 +149,7 @@ __host__ __device__ TEST_CONSTEXPR_CXX23 void test_sfinae_runtime() static_assert(!cuda::std::is_constructible::value, ""); static_assert(!cuda::std::is_constructible::value, ""); static_assert(!cuda::std::is_constructible::value, ""); - // FIXME: __libcpp_compressed_pair attempts to perform a move even though + // FIXME: __cccl_compressed_pair attempts to perform a move even though // it should only copy. // D d; // U u(nullptr, cuda::std::move(d)); diff --git a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp deleted file mode 100644 index edd04088205..00000000000 --- a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.const.eval/is_constant_evaluated.fail.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++98, c++03 - -// - -#include -#include - -#include "test_macros.h" - -int main(int, char**) -{ -#ifndef _CCCL_BUILTIN_IS_CONSTANT_EVALUATED - // expected-error@+1 {{no member named 'is_constant_evaluated' in namespace 'std'}} - bool b = cuda::std::is_constant_evaluated(); -#else - // expected-error-re@+1 {{{{(static_assert|static assertion)}} failed}} - static_assert(!cuda::std::is_constant_evaluated(), ""); -#endif - return 0; -} diff --git a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp index e655fa3aa6f..c9e544789be 100644 --- a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp @@ -79,7 +79,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert(cuda::std::is_constructible_v, ""); @@ -93,7 +93,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((cuda::std::is_constructible_v), ""); @@ -107,7 +107,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((cuda::std::is_constructible_v), ""); @@ -121,7 +121,7 @@ __host__ __device__ void test_is_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((cuda::std::is_constructible_v), ""); @@ -135,7 +135,7 @@ __host__ __device__ void test_is_not_constructible() #ifndef TEST_COMPILER_MSVC // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. - static_assert((!cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((!cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((!cuda::std::is_constructible_v), ""); @@ -149,7 +149,7 @@ __host__ __device__ void test_is_not_constructible() #if !defined(TEST_COMPILER_MSVC) && !(defined(TEST_COMPILER_CLANG) && __clang_major__ >= 16) // The fallback SFINAE version doesn't work reliable with MSVC, and we don't // use it, so waive it. 
- static_assert((!cuda::std::__libcpp_is_constructible::type::value), ""); + static_assert((!cuda::std::__cccl_is_constructible::type::value), ""); #endif #if TEST_STD_VER > 2011 static_assert((!cuda::std::is_constructible_v), ""); @@ -297,11 +297,11 @@ int main(int, char**) // FIXME Clang disallows this construction because it thinks that // 'static_cast(declval>())' is ill-formed. LIBCPP_STATIC_ASSERT( - clang_disallows_valid_static_cast_bug != cuda::std::__libcpp_is_constructible>::value, ""); + clang_disallows_valid_static_cast_bug != cuda::std::__cccl_is_constructible>::value, ""); ((void) clang_disallows_valid_static_cast_bug); // Prevent unused warning # else static_assert(clang_disallows_valid_static_cast_bug == false, ""); - LIBCPP_STATIC_ASSERT(cuda::std::__libcpp_is_constructible>::value, ""); + LIBCPP_STATIC_ASSERT(cuda::std::__cccl_is_constructible>::value, ""); # endif #endif @@ -309,7 +309,7 @@ int main(int, char**) #if defined(TEST_CLANG_VER) && !defined(TEST_COMPILER_NVCC) test_is_constructible>(); LIBCPP_STATIC_ASSERT( - clang_disallows_valid_static_cast_bug != cuda::std::__libcpp_is_constructible>::value, ""); + clang_disallows_valid_static_cast_bug != cuda::std::__cccl_is_constructible>::value, ""); static_assert(cuda::std::is_constructible>::value, ""); #elif defined(TEST_COMPILER_MSVC) && defined(TEST_COMPILER_NVCC) // FIXME NVCC and MSVC disagree about the validity of these tests, and give diff --git a/libcudacxx/test/support/check_assertion.h b/libcudacxx/test/support/check_assertion.h index 6def8f701e1..8d1a2de8da4 100644 --- a/libcudacxx/test/support/check_assertion.h +++ b/libcudacxx/test/support/check_assertion.h @@ -312,7 +312,7 @@ struct DeathTest std::string stderr_from_child_; }; -void std::__libcpp_verbose_abort(char const* format, ...) +void std::__cccl_verbose_abort(char const* format, ...) 
{ assert(!GlobalMatcher().empty()); diff --git a/libcudacxx/test/support/test_macros.h b/libcudacxx/test/support/test_macros.h index c81987a0dc1..28915d10565 100644 --- a/libcudacxx/test/support/test_macros.h +++ b/libcudacxx/test/support/test_macros.h @@ -149,7 +149,7 @@ #if TEST_HAS_BUILTIN(__builtin_is_constant_evaluated) || _CCCL_COMPILER(GCC, >=, 9) \ || (_CCCL_COMPILER(MSVC) && _MSC_VER > 1924 && _CCCL_CUDACC_AT_LEAST(11, 3)) -# define TEST_IS_CONSTANT_EVALUATED() _CUDA_VSTD::__libcpp_is_constant_evaluated() +# define TEST_IS_CONSTANT_EVALUATED() cuda::std::is_constant_evaluated() #else # define TEST_IS_CONSTANT_EVALUATED() false #endif From 90120a4a4f06c93c1f4bb5e8677032cd852e8860 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Tue, 26 Nov 2024 09:04:04 +0100 Subject: [PATCH 28/45] Add more CUB transform benchmarks (#2906) * Add more CUB transform benchmarks Fixes: #2814 --- .../bench/transform/babelstream1.cu | 2 +- .../bench/transform/babelstream2.cu | 2 +- .../bench/transform/babelstream3.cu | 2 +- .../transform/{babelstream.h => common.h} | 0 cub/benchmarks/bench/transform/complex_cmp.cu | 48 +++++++++++ cub/benchmarks/bench/transform/fib.cu | 76 ++++++++++++++++++ cub/benchmarks/bench/transform/heavy.cu | 79 +++++++++++++++++++ .../nvbench_helper/nvbench_helper.cuh | 13 +++ 8 files changed, 219 insertions(+), 3 deletions(-) rename cub/benchmarks/bench/transform/{babelstream.h => common.h} (100%) create mode 100644 cub/benchmarks/bench/transform/complex_cmp.cu create mode 100644 cub/benchmarks/bench/transform/fib.cu create mode 100644 cub/benchmarks/bench/transform/heavy.cu diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu index 87abdfef6ff..c3b9306398d 100644 --- a/cub/benchmarks/bench/transform/babelstream1.cu +++ b/cub/benchmarks/bench/transform/babelstream1.cu @@ -15,7 +15,7 @@ # endif #endif -#include "babelstream.h" +#include "common.h" #if !TUNE_BASE # if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu index c8fa017b788..61d4e905d92 100644 --- a/cub/benchmarks/bench/transform/babelstream2.cu +++ b/cub/benchmarks/bench/transform/babelstream2.cu @@ -15,7 +15,7 @@ # endif #endif -#include "babelstream.h" +#include "common.h" #if !TUNE_BASE # if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu index db541554210..a5c969764ae 100644 --- a/cub/benchmarks/bench/transform/babelstream3.cu +++ b/cub/benchmarks/bench/transform/babelstream3.cu @@ -15,7 +15,7 @@ # endif #endif -#include "babelstream.h" +#include "common.h" #if !TUNE_BASE # if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/common.h similarity index 100% rename from cub/benchmarks/bench/transform/babelstream.h rename to cub/benchmarks/bench/transform/common.h diff --git a/cub/benchmarks/bench/transform/complex_cmp.cu b/cub/benchmarks/bench/transform/complex_cmp.cu new file mode 100644 index 00000000000..ac9eb4b0f8b --- /dev/null +++ b/cub/benchmarks/bench/transform/complex_cmp.cu @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "common.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +// This benchmark tests overlapping memory regions for reading and is compute intensive + +template +static void compare_complex(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector in = generate(n); + thrust::device_vector out(n - 1); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + // the complex comparison needs lots of compute and transform reads from overlapping input + using compare_op = less_t; + bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{}); +} + +// TODO(bgruber): hardcode OffsetT? +NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types)) + .set_name("compare_complex") + .set_type_axes_names({"OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/transform/fib.cu b/cub/benchmarks/bench/transform/fib.cu new file mode 100644 index 00000000000..8a6c4c3dfa8 --- /dev/null +++ b/cub/benchmarks/bench/transform/fib.cu @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "common.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +// This benchmark is compute intensive with diverging threads + +template +struct fib_t +{ + __device__ OutputT operator()(IndexT n) + { + OutputT t1 = 0; + OutputT t2 = 1; + + if (n < 1) + { + return t1; + } + if (n == 1) + { + return t1; + } + if (n == 2) + { + return t2; + } + for (IndexT i = 3; i <= n; ++i) + { + const auto next = t1 + t2; + t1 = t2; + t2 = next; + } + return t2; + } +}; +template +static void fibonacci(nvbench::state& state, nvbench::type_list) +{ + using index_t = int64_t; + using output_t = uint32_t; + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42}); + thrust::device_vector out(n); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t{}); +} + +NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types)) + .set_name("fibonacci") + .set_type_axes_names({"OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu new file mode 100644 index 00000000000..7c35b069e24 --- /dev/null +++ b/cub/benchmarks/bench/transform/heavy.cu @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "common.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +// This benchmark uses a LOT of registers and is compute intensive. 
+ +template +struct heavy_functor +{ + // we need to use an unsigned type so overflow in arithmetic wraps around + __device__ std::uint32_t operator()(std::uint32_t data) const + { + std::uint32_t reg[N]; + reg[0] = data; + for (int i = 1; i < N; ++i) + { + reg[i] = reg[i - 1] * reg[i - 1] + 1; + } + for (int i = 0; i < N; ++i) + { + reg[i] = (reg[i] * reg[i]) % 19; + } + for (int i = 0; i < N; ++i) + { + reg[i] = reg[N - i - 1] * reg[i]; + } + std::uint32_t x = 0; + for (int i = 0; i < N; ++i) + { + x += reg[i]; + } + return x; + } +}; + +template +static void heavy(nvbench::state& state, nvbench::type_list) +{ + using value_t = std::uint32_t; + using offset_t = int; + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector in = generate(n); + thrust::device_vector out(n); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor{}); +} + +template +using ic = ::cuda::std::integral_constant; + +NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list, ic<64>, ic<128>, ic<256>>)) + .set_name("heavy") + .set_type_axes_names({"Heaviness{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index 88b189cf964..9c16bee3033 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -32,6 +32,19 @@ NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex"); NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type"); NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type"); +template +struct nvbench::type_strings<::cuda::std::integral_constant> +{ + static std::string input_string() + { + return std::to_string(I); + } + static std::string description() + { + return "integral_constant<" + type_strings::description() + ", " + std::to_string(I) + ">"; + } +}; + namespace detail { From 159c1c3ed255e02e72fef860792db9cca3e4dbe1 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 09:11:43 +0100 Subject: [PATCH 29/45] Start reworking our math functions (#2749) * Move cmath helpers to `__cmath` subfolder * Drop unused functions * Move `lerp` to its own file * Properly qualify function calls in cmath * Move definition of logarithms into their own file and implement them on our own * Move definition of fp min max to its own file * Move definition of floating point trait functions to their own file * Improve tests to ensure we are not constant folding everything * Also port `fpclassify` to enable proper `isnormal` implementation Co-authored-by: Bernhard Manfred Gruber --- libcudacxx/include/cuda/std/__cccl/builtin.h | 169 +++++- libcudacxx/include/cuda/std/__cmath/common.h | 40 ++ .../include/cuda/std/__cmath/fpclassify.h | 189 +++++++ libcudacxx/include/cuda/std/__cmath/lerp.h | 102 ++++ .../include/cuda/std/__cmath/logarithms.h | 494 ++++++++++++++++++ libcudacxx/include/cuda/std/__cmath/min_max.h | 227 ++++++++ .../cmath_nvbf16.h => __cmath/nvbf16.h} | 57 +- .../cmath_nvfp16.h => __cmath/nvfp16.h} | 79 +-- libcudacxx/include/cuda/std/__cmath/traits.h | 470 +++++++++++++++++ .../include/cuda/std/__complex/nvbf16.h | 2 +- .../include/cuda/std/__complex/nvfp16.h | 2 +- .../include/cuda/std/__type_traits/promote.h | 55 +- 
.../cuda/std/detail/libcxx/include/cmath | 243 ++------- .../cuda/std/detail/libcxx/include/complex | 201 ++++--- .../std/numerics/c.math/fp_min_max.pass.cpp | 118 +++++ .../std/numerics/c.math/fp_traits.pass.cpp | 458 ++++++++++++++++ .../std/numerics/c.math/lerp.pass.cpp | 86 +++ .../std/numerics/c.math/logarithms.pass.cpp | 109 ++++ 18 files changed, 2643 insertions(+), 458 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__cmath/common.h create mode 100644 libcudacxx/include/cuda/std/__cmath/fpclassify.h create mode 100644 libcudacxx/include/cuda/std/__cmath/lerp.h create mode 100644 libcudacxx/include/cuda/std/__cmath/logarithms.h create mode 100644 libcudacxx/include/cuda/std/__cmath/min_max.h rename libcudacxx/include/cuda/std/{__cuda/cmath_nvbf16.h => __cmath/nvbf16.h} (68%) rename libcudacxx/include/cuda/std/{__cuda/cmath_nvfp16.h => __cmath/nvfp16.h} (70%) create mode 100644 libcudacxx/include/cuda/std/__cmath/traits.h create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index b3a53918054..4e0bfae8a9e 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -146,6 +146,32 @@ # define _CCCL_BUILTIN_EXPECT(...) __builtin_expect(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_expect) +#if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FMAXF(...) __builtin_fmaxf(__VA_ARGS__) +# define _CCCL_BUILTIN_FMAX(...) __builtin_fmax(__VA_ARGS__) +# define _CCCL_BUILTIN_FMAXL(...) __builtin_fmaxl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_fmax) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FMAXF +# undef _CCCL_BUILTIN_FMAX +# undef _CCCL_BUILTIN_FMAXL +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_fmin) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FMINF(...) __builtin_fminf(__VA_ARGS__) +# define _CCCL_BUILTIN_FMIN(...) __builtin_fmin(__VA_ARGS__) +# define _CCCL_BUILTIN_FMINL(...) __builtin_fminl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_fmin) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FMINF +# undef _CCCL_BUILTIN_FMIN +# undef _CCCL_BUILTIN_FMINL +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__builtin_FILE) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FILE() __builtin_FILE() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FILE) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FILE) vvv @@ -158,6 +184,15 @@ # define _CCCL_BUILTIN_FILE() __FILE__ #endif // _CCCL_CUDACC_BELOW(11, 3) +#if _CCCL_CHECK_BUILTIN(builtin_fpclassify) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_FPCLASSIFY(...) 
__builtin_fpclassify(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_fpclassify) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_FPCLASSIFY +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__builtin_FUNCTION) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC, >=, 19, 27) # define _CCCL_BUILTIN_FUNCTION() __builtin_FUNCTION() #else // ^^^ _CCCL_HAS_BUILTIN(__builtin_FUNCTION) ^^^ / vvv !_CCCL_HAS_BUILTIN(__builtin_FUNCTION) vvv @@ -180,7 +215,34 @@ # undef _CCCL_BUILTIN_IS_CONSTANT_EVALUATED #endif // _CCCL_STD_VER < 2014 && _CCCL_CUDA_COMPILER_NVCC -#if _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) +#if _CCCL_CHECK_BUILTIN(builtin_isfinite) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(NVRTC) +# define _CCCL_BUILTIN_ISFINITE(...) __builtin_isfinite(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(isfinite) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ISFINITE +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_isinf) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ISINF(...) __builtin_isinf(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(isinf) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ISINF +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_isnan) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ISNAN(...) __builtin_isnan(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(isnan) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_ISNAN +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if (_CCCL_CHECK_BUILTIN(builtin_launder) || (_CCCL_COMPILER(GCC) && _CCCL_GCC_VERSION >= 70000)) # define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_launder) && gcc >= 7 @@ -202,12 +264,105 @@ # define _CCCL_BUILTIN_LINE() __LINE__ #endif // _CCCL_CUDACC_BELOW(11, 3) +#if _CCCL_CHECK_BUILTIN(builtin_log) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOGF(...) __builtin_logf(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG(...) __builtin_log(__VA_ARGS__) +# define _CCCL_BUILTIN_LOGL(...) __builtin_logl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "logf" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOGF +# undef _CCCL_BUILTIN_LOG +# undef _CCCL_BUILTIN_LOGL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + +#if _CCCL_CHECK_BUILTIN(builtin_log10) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOG10F(...) __builtin_log10f(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG10(...) __builtin_log10(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG10L(...) __builtin_log10l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log10) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "log10f" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOG10F +# undef _CCCL_BUILTIN_LOG10 +# undef _CCCL_BUILTIN_LOG10L +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_ilogb) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_ILOGBF(...) __builtin_ilogbf(__VA_ARGS__) +# define _CCCL_BUILTIN_ILOGB(...) 
__builtin_ilogb(__VA_ARGS__) +# define _CCCL_BUILTIN_ILOGBL(...) __builtin_ilogbl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log10) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "ilogb" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_ILOGBF +# undef _CCCL_BUILTIN_ILOGB +# undef _CCCL_BUILTIN_ILOGBL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + +#if _CCCL_CHECK_BUILTIN(builtin_log1p) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOG1PF(...) __builtin_log1pf(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG1P(...) __builtin_log1p(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG1PL(...) __builtin_log1pl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log1p) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "log1p" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOG1PF +# undef _CCCL_BUILTIN_LOG1P +# undef _CCCL_BUILTIN_LOG1PL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + +#if _CCCL_CHECK_BUILTIN(builtin_log2) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOG2F(...) __builtin_log2f(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG2(...) __builtin_log2(__VA_ARGS__) +# define _CCCL_BUILTIN_LOG2L(...) __builtin_log2l(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log1) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "log2f" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOG2F +# undef _CCCL_BUILTIN_LOG2 +# undef _CCCL_BUILTIN_LOG2L +#endif // _CCCL_CUDACC_BELOW(11, 7) + +#if _CCCL_CHECK_BUILTIN(builtin_logb) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_LOGBF(...) __builtin_logbf(__VA_ARGS__) +# define _CCCL_BUILTIN_LOGB(...) __builtin_logb(__VA_ARGS__) +# define _CCCL_BUILTIN_LOGBL(...) __builtin_logbl(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_log1) + +// Below 11.7 nvcc treats the builtin as a host only function +// clang-cuda fails with fatal error: error in backend: Undefined external symbol "logb" +#if _CCCL_CUDACC_BELOW(11, 7) || defined(_CCCL_CUDA_COMPILER_CLANG) +# undef _CCCL_BUILTIN_LOGBF +# undef _CCCL_BUILTIN_LOGB +# undef _CCCL_BUILTIN_LOGBL +#endif // _CCCL_CUDACC_BELOW(11, 7) || _CCCL_CUDA_COMPILER_CLANG + #if _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) \ && defined(_CCCL_CUDA_COMPILER_CLANG) # define _CCCL_BUILTIN_OPERATOR_DELETE(...) __builtin_operator_delete(__VA_ARGS__) # define _CCCL_BUILTIN_OPERATOR_NEW(...) __builtin_operator_new(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(__builtin_operator_new) && _CCCL_CHECK_BUILTIN(__builtin_operator_delete) +#if _CCCL_CHECK_BUILTIN(builtin_signbit) || _CCCL_COMPILER(GCC) +# define _CCCL_BUILTIN_SIGNBIT(...) __builtin_signbit(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_signbit) + +// Below 11.7 nvcc treats the builtin as a host only function +#if _CCCL_CUDACC_BELOW(11, 7) +# undef _CCCL_BUILTIN_SIGNBIT +#endif // _CCCL_CUDACC_BELOW(11, 7) + #if _CCCL_HAS_BUILTIN(__decay) && defined(_CCCL_CUDA_COMPILER_CLANG) # define _CCCL_BUILTIN_DECAY(...) __decay(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__decay) && clang-cuda @@ -471,18 +626,6 @@ # define _CCCL_BUILTIN_IS_VOLATILE(...) 
__is_volatile(__VA_ARGS__) #endif // _CCCL_HAS_BUILTIN(__is_volatile) -#if _CCCL_CHECK_BUILTIN(isfinite) -# define _CCCL_BUILTIN_ISFINITE(...) __builtin_isfinite(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(isfinite) - -#if _CCCL_CHECK_BUILTIN(isinf) -# define _CCCL_BUILTIN_ISINF(...) __builtin_isinf(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(isinf) - -#if _CCCL_CHECK_BUILTIN(isnan) -# define _CCCL_BUILTIN_ISNAN(...) __builtin_isnan(__VA_ARGS__) -#endif // _CCCL_CHECK_BUILTIN(isnan) - #if _CCCL_CHECK_BUILTIN(make_integer_seq) || _CCCL_COMPILER(MSVC, >=, 19, 23) # define _CCCL_BUILTIN_MAKE_INTEGER_SEQ(...) __make_integer_seq<__VA_ARGS__> #endif // _CCCL_CHECK_BUILTIN(make_integer_seq) diff --git a/libcudacxx/include/cuda/std/__cmath/common.h b/libcudacxx/include/cuda/std/__cmath/common.h new file mode 100644 index 00000000000..0f6f444d957 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/common.h @@ -0,0 +1,40 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_COMMON_H +#define _LIBCUDACXX___CMATH_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// MSVC and clang cuda need the host side functions included +#if _CCCL_COMPILER(MSVC) || defined(_CCCL_CUDA_COMPILER_CLANG) +# include +#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER_CLANG + +#if defined(_LIBCUDACXX_HAS_NVFP16) +# include +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") +# include +_CCCL_DIAG_POP +#endif // _LIBCUDACXX_HAS_NVBF16 + +#endif // _LIBCUDACXX___CMATH_COMMON_H diff --git a/libcudacxx/include/cuda/std/__cmath/fpclassify.h b/libcudacxx/include/cuda/std/__cmath/fpclassify.h new file mode 100644 index 00000000000..c55e88cb792 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/fpclassify.h @@ -0,0 +1,189 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_FPCLASSIFY_H +#define _LIBCUDACXX___CMATH_FPCLASSIFY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#if _CCCL_COMPILER(NVRTC) +# ifndef FP_NAN +# define FP_NAN 0 +# endif // ! FP_NAN +# ifndef FP_INFINITE +# define FP_INFINITE 1 +# endif // ! 
FP_INFINITE +# ifndef FP_ZERO +# define FP_ZERO 2 +# endif // ! FP_ZERO +# ifndef FP_SUBNORMAL +# define FP_SUBNORMAL 3 +# endif // ! FP_SUBNORMAL +# ifndef FP_NORMAL +# define FP_NORMAL 4 +# endif // ! FP_NORMAL +#else // ^^^ _CCCL_COMPILER(NVRTC) ^^^ ^/ vvv !_CCCL_COMPILER(NVRTC) vvv +# include +#endif // !_CCCL_COMPILER(NVRTC) + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +struct _CCCL_FLOAT_BITS +{ +#if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned int man : 23; + unsigned int exp : 8; + unsigned int sign : 1; +#else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned int sign : 1; + unsigned int exp : 8; + unsigned int man : 23; +#endif // _LIBCUDACXX_BIG_ENDIAN +}; + +struct _CCCL_DOUBLE_BITS +{ +#if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned int manl : 32; + unsigned int manh : 20; + unsigned int exp : 11; + unsigned int sign : 1; +#else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned int sign : 1; + unsigned int exp : 11; + unsigned int manh : 20; + unsigned int manl : 32; +#endif // _LIBCUDACXX_BIG_ENDIAN +}; + +#if defined(_LIBCUDACXX_HAS_NVFP16) +struct _CCCL_HALF_BITS +{ +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned short man : 10; + unsigned short exp : 5; + unsigned short sign : 1; +# else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned short sign : 1; + unsigned short exp : 5; + unsigned short man : 10; +# endif // _LIBCUDACXX_BIG_ENDIAN +}; +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +struct _CCCL_NVBFLOAT_BITS +{ +# if defined(_LIBCUDACXX_LITTLE_ENDIAN) + unsigned short man : 7; + unsigned short exp : 8; + unsigned short sign : 1; +# else // ^^^ _LIBCUDACXX_LITTLE_ENDIAN ^^^ / vvv _LIBCUDACXX_BIG_ENDIAN vvv + unsigned short sign : 1; + unsigned short exp : 8; + unsigned short man : 7; +# endif // _LIBCUDACXX_BIG_ENDIAN +}; +#endif // _LIBCUDACXX_HAS_NVBF16 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(float __x) noexcept +{ + _CCCL_FLOAT_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_FLOAT_BITS>(__x); + if (__bits.exp == 0) + { + return __bits.man == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 255) + { + return __bits.man == 0 ? FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(double __x) noexcept +{ + _CCCL_DOUBLE_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_DOUBLE_BITS>(__x); + if (__bits.exp == 0) + { + return (__bits.manl | __bits.manh) == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 2047) + { + return (__bits.manl | __bits.manh) == 0 ? FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_FPCLASSIFY) + return _CCCL_BUILTIN_FPCLASSIFY(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); +# else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::fpclassify(__x); +# endif // !_CCCL_BUILTIN_SIGNBIT +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(__half __x) noexcept +{ + _CCCL_HALF_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_HALF_BITS>(__x); + if (__bits.exp == 0) + { + return __bits.man == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 31) + { + return __bits.man == 0 ? 
FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(__nv_bfloat16 __x) noexcept +{ + _CCCL_NVBFLOAT_BITS __bits = _CUDA_VSTD::bit_cast<_CCCL_NVBFLOAT_BITS>(__x); + if (__bits.exp == 0) + { + return __bits.man == 0 ? FP_ZERO : FP_SUBNORMAL; + } + if (__bits.exp == 255) + { + return __bits.man == 0 ? FP_INFINITE : FP_NAN; + } + return (FP_NORMAL); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int fpclassify(_A1 __x) noexcept +{ + return (__x == 0) ? FP_ZERO : FP_NORMAL; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_FPCLASSIFY_H diff --git a/libcudacxx/include/cuda/std/__cmath/lerp.h b/libcudacxx/include/cuda/std/__cmath/lerp.h new file mode 100644 index 00000000000..1665a82bb2f --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/lerp.h @@ -0,0 +1,102 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_LERP_H +#define _LIBCUDACXX___CMATH_LERP_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Fp __lerp(_Fp __a, _Fp __b, _Fp __t) noexcept +{ + if ((__a <= 0 && __b >= 0) || (__a >= 0 && __b <= 0)) + { + return __t * __b + (1 - __t) * __a; + } + + if (__t == 1) + { + return __b; + } + const _Fp __x = __a + __t * (__b - __a); + if ((__t > 1) == (__b > __a)) + { + return __b < __x ? __x : __b; + } + else + { + return __x < __b ? 
__x : __b; + } +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 float lerp(float __a, float __b, float __t) noexcept +{ + return _CUDA_VSTD::__lerp(__a, __b, __t); +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double lerp(double __a, double __b, double __t) noexcept +{ + return _CUDA_VSTD::__lerp(__a, __b, __t); +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 long double +lerp(long double __a, long double __b, long double __t) noexcept +{ + return _CUDA_VSTD::__lerp(__a, __b, __t); +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half lerp(__half __a, __half __b, __half __t) noexcept +{ + return __float2half(_CUDA_VSTD::__lerp(__half2float(__a), __half2float(__b), __half2float(__t))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 +lerp(__nv_bfloat16 __a, __nv_bfloat16 __b, __nv_bfloat16 __t) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::__lerp(__bfloat162float(__a), __bfloat162float(__b), __bfloat162float(__t))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 +enable_if_t<_CCCL_TRAIT(is_arithmetic, _A1) && _CCCL_TRAIT(is_arithmetic, _A2) && _CCCL_TRAIT(is_arithmetic, _A3), + __promote_t<_A1, _A2, _A3>> +lerp(_A1 __a, _A2 __b, _A3 __t) noexcept +{ + using __result_type = __promote_t<_A1, _A2, _A3>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type) + && _CCCL_TRAIT(is_same, _A3, __result_type)), + ""); + return _CUDA_VSTD::__lerp((__result_type) __a, (__result_type) __b, (__result_type) __t); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_LERP_H diff --git a/libcudacxx/include/cuda/std/__cmath/logarithms.h b/libcudacxx/include/cuda/std/__cmath/logarithms.h new file mode 100644 index 00000000000..660b674f99b --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/logarithms.h @@ -0,0 +1,494 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_LOGARITHMS_H +#define _LIBCUDACXX___CMATH_LOGARITHMS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// log + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGF) + return _CCCL_BUILTIN_LOGF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGF ^^^ / vvv !_CCCL_BUILTIN_LOGF vvv + return ::logf(__x); +#endif // !_CCCL_BUILTIN_LOGF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float logf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGF) + return _CCCL_BUILTIN_LOGF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGF ^^^ / vvv !_CCCL_BUILTIN_LOGF vvv + return ::logf(__x); +#endif // !_CCCL_BUILTIN_LOGF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG) + return _CCCL_BUILTIN_LOG(__x); +#else // ^^^ _CCCL_BUILTIN_LOG ^^^ / vvv !_CCCL_BUILTIN_LOG vvv + return ::log(__x); +#endif // !_CCCL_BUILTIN_LOG +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGL) + return _CCCL_BUILTIN_LOGL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGL ^^^ / vvv !_CCCL_BUILTIN_LOGL vvv + return ::logl(__x); +# endif // !_CCCL_BUILTIN_LOGL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double logl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGL) + return _CCCL_BUILTIN_LOGL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGL ^^^ / vvv !_CCCL_BUILTIN_LOGL vvv + return ::logl(__x); +# endif // !_CCCL_BUILTIN_LOGL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log(__half __x) noexcept +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, (return ::hlog(__x);), ({ + float __vf = __half2float(__x); + __vf = _CUDA_VSTD::logf(__vf); + __half_raw __ret_repr = ::__float2half_rn(__vf); + + _CUDA_VSTD::uint16_t __repr = __half_raw(__x).x; + switch (__repr) + { + case 7544: + __ret_repr.x -= 1; + break; + + default:; + } + + return __ret_repr; + })) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hlog(__x);), (return __float2bfloat16(_CUDA_VSTD::logf(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG) + return _CCCL_BUILTIN_LOG((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG ^^^ / vvv !_CCCL_BUILTIN_LOG vvv + return ::log((double) __x); +#endif // !_CCCL_BUILTIN_LOG +} + +// log10 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log10(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10F) + return _CCCL_BUILTIN_LOG10F(__x); +#else // ^^^ _CCCL_BUILTIN_LOG10F ^^^ / vvv !_CCCL_BUILTIN_LOG10F vvv + return ::log10f(__x); +#endif // !_CCCL_BUILTIN_LOG10F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log10f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10F) + return _CCCL_BUILTIN_LOG10F(__x); +#else // 
^^^ _CCCL_BUILTIN_LOG10F ^^^ / vvv !_CCCL_BUILTIN_LOG10F vvv + return ::log10f(__x); +#endif // !_CCCL_BUILTIN_LOG10F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log10(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10) + return _CCCL_BUILTIN_LOG10(__x); +#else // ^^^ _CCCL_BUILTIN_LOG10 ^^^ / vvv !_CCCL_BUILTIN_LOG10 vvv + return ::log10(__x); +#endif // !_CCCL_BUILTIN_LOG10 +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log10(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG10L) + return _CCCL_BUILTIN_LOG10L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG10L ^^^ / vvv !_CCCL_BUILTIN_LOG10L vvv + return ::log10l(__x); +# endif // !_CCCL_BUILTIN_LOG10L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log10l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG10L) + return _CCCL_BUILTIN_LOG10L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG10L ^^^ / vvv !_CCCL_BUILTIN_LOG10L vvv + return ::log10l(__x); +# endif // !_CCCL_BUILTIN_LOG10L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log10(__half __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_53, (return ::hlog10(__x);), (return __float2half(_CUDA_VSTD::log10f(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log10(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hlog10(__x);), (return __float2bfloat16(_CUDA_VSTD::log10f(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log10(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG10) + return _CCCL_BUILTIN_LOG10((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG10 ^^^ / vvv !_CCCL_BUILTIN_LOG10 vvv + return ::log10((double) __x); +#endif // !_CCCL_BUILTIN_LOG10 +} + +// ilogb + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGBF) + return _CCCL_BUILTIN_ILOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_ILOGBF ^^^ / vvv !_CCCL_BUILTIN_ILOGBF vvv + return ::ilogbf(__x); +#endif // !_CCCL_BUILTIN_ILOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogbf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGBF) + return _CCCL_BUILTIN_ILOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_ILOGBF ^^^ / vvv !_CCCL_BUILTIN_ILOGBF vvv + return ::ilogbf(__x); +#endif // !_CCCL_BUILTIN_ILOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGB) + return _CCCL_BUILTIN_ILOGB(__x); +#else // ^^^ _CCCL_BUILTIN_ILOGB ^^^ / vvv !_CCCL_BUILTIN_ILOGB vvv + return ::ilogb(__x); +#endif // !_CCCL_BUILTIN_ILOGB +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ILOGBL) + return _CCCL_BUILTIN_ILOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_ILOGBL ^^^ / vvv !_CCCL_BUILTIN_ILOGBL vvv + return ::ilogbl(__x); +# endif // !_CCCL_BUILTIN_ILOGBL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogbl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ILOGBL) + return _CCCL_BUILTIN_ILOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_ILOGBL ^^^ / vvv !_CCCL_BUILTIN_ILOGBL vvv + return ::ilogbl(__x); +# endif // !_CCCL_BUILTIN_ILOGBL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) 
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(__half __x) noexcept +{ + return _CUDA_VSTD::ilogbf(__half2float(__x)); +} +#endif // defined(_LIBCUDACXX_HAS_NVFP16) + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::ilogbf(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI int ilogb(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ILOGB) + return _CCCL_BUILTIN_ILOGB((double) __x); +#else // ^^^ _CCCL_BUILTIN_ILOGB ^^^ / vvv !_CCCL_BUILTIN_ILOGB vvv + return ::ilogb((double) __x); +#endif // !_CCCL_BUILTIN_ILOGB +} + +// log1p + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log1p(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1PF) + return _CCCL_BUILTIN_LOG1PF(__x); +#else // ^^^ _CCCL_BUILTIN_LOG1PF ^^^ / vvv !_CCCL_BUILTIN_LOG1PF vvv + return ::log1pf(__x); +#endif // !_CCCL_BUILTIN_LOG1PF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log1pf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1PF) + return _CCCL_BUILTIN_LOG1PF(__x); +#else // ^^^ _CCCL_BUILTIN_LOG1PF ^^^ / vvv !_CCCL_BUILTIN_LOG1PF vvv + return ::log1pf(__x); +#endif // !_CCCL_BUILTIN_LOG1PF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log1p(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1P) + return _CCCL_BUILTIN_LOG1P(__x); +#else // ^^^ _CCCL_BUILTIN_LOG1P ^^^ / vvv !_CCCL_BUILTIN_LOG1P vvv + return ::log1p(__x); +#endif // !_CCCL_BUILTIN_LOG1P +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log1p(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG1PL) + return _CCCL_BUILTIN_LOG1PL(__x); +# else // ^^^ _CCCL_BUILTIN_LOG1PL ^^^ / vvv !_CCCL_BUILTIN_LOG1PL vvv + return ::log1pl(__x); +# endif // !_CCCL_BUILTIN_LOG1PL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log1pl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG1PL) + return _CCCL_BUILTIN_LOG1PL(__x); +# else // ^^^ _CCCL_BUILTIN_LOG1PL ^^^ / vvv !_CCCL_BUILTIN_LOG1PL vvv + return ::log1pl(__x); +# endif // !_CCCL_BUILTIN_LOG1PL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log1p(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::log1pf(__half2float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log1p(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::log1pf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log1p(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG1P) + return _CCCL_BUILTIN_LOG1P((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG1P ^^^ / vvv !_CCCL_BUILTIN_LOG1P vvv + return ::log1p((double) __x); +#endif // !_CCCL_BUILTIN_LOG1P +} + +// log2 + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log2(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2F) + return _CCCL_BUILTIN_LOG2F(__x); +#else // ^^^ _CCCL_BUILTIN_LOG2F ^^^ / vvv !_CCCL_BUILTIN_LOG2F vvv + return ::log2f(__x); +#endif // !_CCCL_BUILTIN_LOG2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float log2f(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2F) + return _CCCL_BUILTIN_LOG2F(__x); +#else // ^^^ _CCCL_BUILTIN_LOG2F ^^^ / vvv !_CCCL_BUILTIN_LOG2F vvv + return ::log2f(__x); +#endif // 
!_CCCL_BUILTIN_LOG2F +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log2(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2) + return _CCCL_BUILTIN_LOG2(__x); +#else // ^^^ _CCCL_BUILTIN_LOG2 ^^^ / vvv !_CCCL_BUILTIN_LOG2 vvv + return ::log2(__x); +#endif // !_CCCL_BUILTIN_LOG2 +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log2(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG2L) + return _CCCL_BUILTIN_LOG2L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG2L ^^^ / vvv !_CCCL_BUILTIN_LOG2L vvv + return ::log2l(__x); +# endif // !_CCCL_BUILTIN_LOG2L +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double log2l(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOG2L) + return _CCCL_BUILTIN_LOG2L(__x); +# else // ^^^ _CCCL_BUILTIN_LOG2L ^^^ / vvv !_CCCL_BUILTIN_LOG2L vvv + return ::log2l(__x); +# endif // !_CCCL_BUILTIN_LOG2L +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half log2(__half __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_53, (return ::hlog2(__x);), (return __float2half(_CUDA_VSTD::log2f(__half2float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log2(__nv_bfloat16 __x) noexcept +{ + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, (return ::hlog2(__x);), (return __float2bfloat16(_CUDA_VSTD::log2f(__bfloat162float(__x)));)) +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double log2(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOG2) + return _CCCL_BUILTIN_LOG2((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOG2 ^^^ / vvv !_CCCL_BUILTIN_LOG2 vvv + return ::log2((double) __x); +#endif // !_CCCL_BUILTIN_LOG2 +} + +// logb + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float logb(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGBF) + return _CCCL_BUILTIN_LOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGBF ^^^ / vvv !_CCCL_BUILTIN_LOGBF vvv + return ::logbf(__x); +#endif // !_CCCL_BUILTIN_LOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float logbf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGBF) + return _CCCL_BUILTIN_LOGBF(__x); +#else // ^^^ _CCCL_BUILTIN_LOGBF ^^^ / vvv !_CCCL_BUILTIN_LOGBF vvv + return ::logbf(__x); +#endif // !_CCCL_BUILTIN_LOGBF +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double logb(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGB) + return _CCCL_BUILTIN_LOGB(__x); +#else // ^^^ _CCCL_BUILTIN_LOGB ^^^ / vvv !_CCCL_BUILTIN_LOGB vvv + return ::logb(__x); +#endif // !_CCCL_BUILTIN_LOGB +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double logb(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGBL) + return _CCCL_BUILTIN_LOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGBL ^^^ / vvv !_CCCL_BUILTIN_LOGBL vvv + return ::logbl(__x); +# endif // !_CCCL_BUILTIN_LOGBL +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double logbl(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_LOGBL) + return _CCCL_BUILTIN_LOGBL(__x); +# else // ^^^ _CCCL_BUILTIN_LOGBL ^^^ / vvv !_CCCL_BUILTIN_LOGBL vvv + return ::logbl(__x); +# endif // !_CCCL_BUILTIN_LOGBL +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half logb(__half __x) noexcept +{ + return __float2half(_CUDA_VSTD::logbf(__half2float(__x))); +} +#endif // 
_LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 logb(__nv_bfloat16 __x) noexcept +{ + return __float2bfloat16(_CUDA_VSTD::logbf(__bfloat162float(__x))); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double logb(_Integer __x) noexcept +{ +#if defined(_CCCL_BUILTIN_LOGB) + return _CCCL_BUILTIN_LOGB((double) __x); +#else // ^^^ _CCCL_BUILTIN_LOGB ^^^ / vvv !_CCCL_BUILTIN_LOGB vvv + return ::logb((double) __x); +#endif // !_CCCL_BUILTIN_LOGB +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_LOGARITHMS_H diff --git a/libcudacxx/include/cuda/std/__cmath/min_max.h b/libcudacxx/include/cuda/std/__cmath/min_max.h new file mode 100644 index 00000000000..009fd499ac8 --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/min_max.h @@ -0,0 +1,227 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_MIN_MAX_H +#define _LIBCUDACXX___CMATH_MIN_MAX_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// fmax + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fmax(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxf(__x, __y); +#endif // !_CCCL_BUILTIN_FMAX +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fmaxf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxf(__x, __y); +#endif // !_CCCL_BUILTIN_FMAX +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double fmax(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAX(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmax(__x, __y); +#endif // !_CCCL_BUILTIN_FMAX +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fmax(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxl(__x, __y); +# endif // !_CCCL_BUILTIN_FMAX +} +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fmaxl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMAX) + return _CCCL_BUILTIN_FMAXL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMAX ^^^ / vvv !_CCCL_BUILTIN_FMAX vvv + return ::fmaxl(__x, __y); +# endif // !_CCCL_BUILTIN_FMAX +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half fmax(__half __x, __half __y) noexcept +{ + 
NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmax(__x, __y);), + (return __float2half(_CUDA_VSTD::fmaxf(__half2float(__x), __half2float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmax(__half __x, _A1 __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__half2float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmax(_A1 __x, __half __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__x, __half2float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 fmax(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmax(__x, __y);), + (return __float2bfloat16(_CUDA_VSTD::fmaxf(__bfloat162float(__x), __bfloat162float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmax(__nv_bfloat16 __x, _A1 __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__bfloat162float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmax(_A1 __x, __nv_bfloat16 __y) noexcept +{ + return _CUDA_VSTD::fmaxf(__x, __bfloat162float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> fmax(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::fmax((__result_type) __x, (__result_type) __y); +} + +// fmin + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fmin(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminf(__x, __y); +#endif // !_CCCL_BUILTIN_FMIN +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI float fminf(float __x, float __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINF(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminf(__x, __y); +#endif // !_CCCL_BUILTIN_FMIN +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI double fmin(double __x, double __y) noexcept +{ +#if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMIN(__x, __y); +#else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fmin(__x, __y); +#endif // !_CCCL_BUILTIN_FMIN +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fmin(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminl(__x, __y); +# endif // !_CCCL_BUILTIN_FMIN +} +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI long double fminl(long double __x, long double __y) noexcept +{ +# if defined(_CCCL_BUILTIN_FMIN) + return _CCCL_BUILTIN_FMINL(__x, __y); +# else // ^^^ _CCCL_BUILTIN_FMIN ^^^ / vvv !_CCCL_BUILTIN_FMIN vvv + return ::fminl(__x, __y); +# endif // !_CCCL_BUILTIN_FMIN +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __half fmin(__half __x, __half __y) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmin(__x, __y);), + (return __float2half(_CUDA_VSTD::fminf(__half2float(__x), __half2float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmin(__half __x, 
_A1 __y) noexcept +{ + return _CUDA_VSTD::fminf(__half2float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmin(_A1 __x, __half __y) noexcept +{ + return _CUDA_VSTD::fminf(__x, __half2float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 fmin(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return ::__hmin(__x, __y);), + (return __float2bfloat16(_CUDA_VSTD::fminf(__bfloat162float(__x), __bfloat162float(__y)));)) +} +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t fmin(__nv_bfloat16 __x, _A1 __y) noexcept +{ + return _CUDA_VSTD::fminf(__bfloat162float(__x), __y); +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, float> fmin(_A1 __x, __nv_bfloat16 __y) noexcept +{ + return _CUDA_VSTD::fminf(__x, __bfloat162float(__y)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __promote_t<_A1, _A2> fmin(_A1 __x, _A2 __y) noexcept +{ + using __result_type = __promote_t<_A1, _A2>; + static_assert(!(_CCCL_TRAIT(is_same, _A1, __result_type) && _CCCL_TRAIT(is_same, _A2, __result_type)), ""); + return _CUDA_VSTD::fmin((__result_type) __x, (__result_type) __y); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_MIN_MAX_H diff --git a/libcudacxx/include/cuda/std/__cuda/cmath_nvbf16.h b/libcudacxx/include/cuda/std/__cmath/nvbf16.h similarity index 68% rename from libcudacxx/include/cuda/std/__cuda/cmath_nvbf16.h rename to libcudacxx/include/cuda/std/__cmath/nvbf16.h index 08ad0445e01..8f116968f8b 100644 --- a/libcudacxx/include/cuda/std/__cuda/cmath_nvbf16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvbf16.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCUDACXX___CUDA_CMATH_NVBF16_H -#define _LIBCUDACXX___CUDA_CMATH_NVBF16_H +#ifndef _LIBCUDACXX___CMATH_NVBF16_H +#define _LIBCUDACXX___CMATH_NVBF16_H #include @@ -70,57 +70,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 atan2(__nv_bfloat16 __x, __nv_bfloat16 _ return __float2bfloat16(::atan2f(__bfloat162float(__x), __bfloat162float(__y))); } -_LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 log(__nv_bfloat16 __x) -{ - NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hlog(__x);), (return __float2bfloat16(::logf(__bfloat162float(__x)));)) -} - _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 sqrt(__nv_bfloat16 __x) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2bfloat16(::sqrtf(__bfloat162float(__x)));)) } // floating point helper -_LIBCUDACXX_HIDE_FROM_ABI bool signbit(__nv_bfloat16 __v) -{ - return ::signbit(::__bfloat162float(__v)); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isnan(__nv_bfloat16 __x) noexcept -{ - return ::__hisnan(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isnan(__nv_bfloat16 __v) -{ - return __constexpr_isnan(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isinf(__nv_bfloat16 __x) noexcept -{ -# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) - // this is a workaround for nvbug 4362808 - return !::__hisnan(__x) && ::__hisnan(__x - __x); -# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv - return ::__hisinf(__x) != 0; -# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_BELOW(12, 3) -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isinf(__nv_bfloat16 __v) -{ - return __constexpr_isinf(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isfinite(__nv_bfloat16 __x) 
noexcept -{ - return !__constexpr_isnan(__x) && !__constexpr_isinf(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__nv_bfloat16 __v) -{ - return __constexpr_isfinite(__v); -} - _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept { return __float2bfloat16(::copysignf(__bfloat162float(__x), __bfloat162float(__y))); @@ -128,7 +83,7 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) { - return __constexpr_copysign(__x, __y); + return _CUDA_VSTD::__constexpr_copysign(__x, __y); } _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_fabs(__nv_bfloat16 __x) noexcept @@ -138,12 +93,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_fabs(__nv_bfloat16 __x) noex _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 fabs(__nv_bfloat16 __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 abs(__nv_bfloat16 __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __nv_bfloat16 __constexpr_fmax(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept @@ -155,4 +110,4 @@ _LIBCUDACXX_END_NAMESPACE_STD #endif /// _LIBCUDACXX_HAS_NVBF16 -#endif // _LIBCUDACXX___CUDA_CMATH_NVBF16_H +#endif // _LIBCUDACXX___CMATH_NVBF16_H diff --git a/libcudacxx/include/cuda/std/__cuda/cmath_nvfp16.h b/libcudacxx/include/cuda/std/__cmath/nvfp16.h similarity index 70% rename from libcudacxx/include/cuda/std/__cuda/cmath_nvfp16.h rename to libcudacxx/include/cuda/std/__cmath/nvfp16.h index 42f314b36bf..dbcaebbb4ef 100644 --- a/libcudacxx/include/cuda/std/__cuda/cmath_nvfp16.h +++ b/libcudacxx/include/cuda/std/__cmath/nvfp16.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCUDACXX___CUDA_CMATH_NVFP16_H -#define _LIBCUDACXX___CUDA_CMATH_NVFP16_H +#ifndef _LIBCUDACXX___CMATH_NVFP16_H +#define _LIBCUDACXX___CMATH_NVFP16_H #include @@ -135,79 +135,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __half atan2(__half __x, __half __y) return __float2half(::atan2f(__half2float(__x), __half2float(__y))); } -// clang-format off -_LIBCUDACXX_HIDE_FROM_ABI __half log(__half __x) -{ - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_53, ( - return ::hlog(__x); - ), ( - { - float __vf = __half2float(__x); - __vf = ::logf(__vf); - __half_raw __ret_repr = ::__float2half_rn(__vf); - - uint16_t __repr = __half_raw(__x).x; - switch (__repr) - { - case 7544: - __ret_repr.x -= 1; - break; - - default:; - } - - return __ret_repr; - } - )) -} -// clang-format on - _LIBCUDACXX_HIDE_FROM_ABI __half sqrt(__half __x) { NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __float2half(::sqrtf(__half2float(__x)));)) } // floating point helper -_LIBCUDACXX_HIDE_FROM_ABI bool signbit(__half __v) -{ - return ::signbit(::__half2float(__v)); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isnan(__half __x) noexcept -{ - return ::__hisnan(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isnan(__half __v) -{ - return __constexpr_isnan(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isinf(__half __x) noexcept -{ -# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) - // this is a workaround for nvbug 4362808 - return !::__hisnan(__x) && ::__hisnan(__x - __x); -# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv - return ::__hisinf(__x) != 0; -# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_BELOW(12, 3) -} - 
-_LIBCUDACXX_HIDE_FROM_ABI bool isinf(__half __v) -{ - return __constexpr_isinf(__v); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool __constexpr_isfinite(__half __x) noexcept -{ - return !__constexpr_isnan(__x) && !__constexpr_isinf(__x); -} - -_LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__half __v) -{ - return __constexpr_isfinite(__v); -} - _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_copysign(__half __x, __half __y) noexcept { return __float2half(::copysignf(__half2float(__x), __half2float(__y))); @@ -215,7 +148,7 @@ _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_copysign(__half __x, __half __y) no _LIBCUDACXX_HIDE_FROM_ABI __half copysign(__half __x, __half __y) { - return __constexpr_copysign(__x, __y); + return _CUDA_VSTD::__constexpr_copysign(__x, __y); } _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_fabs(__half __x) noexcept @@ -225,12 +158,12 @@ _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_fabs(__half __x) noexcept _LIBCUDACXX_HIDE_FROM_ABI __half fabs(__half __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __half abs(__half __x) { - return __constexpr_fabs(__x); + return _CUDA_VSTD::__constexpr_fabs(__x); } _LIBCUDACXX_HIDE_FROM_ABI __half __constexpr_fmax(__half __x, __half __y) noexcept @@ -242,4 +175,4 @@ _LIBCUDACXX_END_NAMESPACE_STD #endif /// _LIBCUDACXX_HAS_NVFP16 -#endif // _LIBCUDACXX___CUDA_CMATH_NVFP16_H +#endif // _LIBCUDACXX___CMATH_NVFP16_H diff --git a/libcudacxx/include/cuda/std/__cmath/traits.h b/libcudacxx/include/cuda/std/__cmath/traits.h new file mode 100644 index 00000000000..cac18ee341b --- /dev/null +++ b/libcudacxx/include/cuda/std/__cmath/traits.h @@ -0,0 +1,470 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___CMATH_TRAITS_H +#define _LIBCUDACXX___CMATH_TRAITS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SIGNBIT) + return _CCCL_BUILTIN_SIGNBIT(__x); +#else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::signbit(__x); +#endif // !_CCCL_BUILTIN_SIGNBIT +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_SIGNBIT) + return _CCCL_BUILTIN_SIGNBIT(__x); +#else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::signbit(__x); +#endif // !_CCCL_BUILTIN_SIGNBIT +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_SIGNBIT) + return _CCCL_BUILTIN_SIGNBIT(__x); +# else // ^^^ _CCCL_BUILTIN_SIGNBIT ^^^ / vvv !_CCCL_BUILTIN_SIGNBIT vvv + return ::signbit(__x); +# endif // !_CCCL_BUILTIN_SIGNBIT +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(__half __x) noexcept +{ + return _CUDA_VSTD::signbit(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::signbit(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(_A1 __x) noexcept +{ + return __x < 0; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool signbit(_A1) noexcept +{ + return false; +} + +// isfinite + +#if defined(_CCCL_BUILTIN_ISFINITE) || (defined(_CCCL_BUILTIN_ISINF) && defined(_CCCL_BUILTIN_ISNAN)) +# define _CCCL_CONSTEXPR_ISFINITE constexpr +#else // ^^^ _CCCL_BUILTIN_ISFINITE ^^^ / vvv !_CCCL_BUILTIN_ISFINITE vvv +# define _CCCL_CONSTEXPR_ISFINITE +#endif // !_CCCL_BUILTIN_ISFINITE + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool isfinite(_A1) noexcept +{ + return true; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISFINITE bool isfinite(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISFINITE) + return _CCCL_BUILTIN_ISFINITE(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return !::__isinf(__x) && !::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isfinite(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISFINITE bool isfinite(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISFINITE) + return _CCCL_BUILTIN_ISFINITE(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return !::__isinf(__x) && !::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isfinite(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISFINITE bool 
isfinite(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ISFINITE) + return _CCCL_BUILTIN_ISFINITE(__x); +# elif _CCCL_CUDACC_BELOW(11, 8) + return !::__isinf(__x) && !::__isnan(__x); +# else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isfinite(__x); +# endif // !_CCCL_CUDACC_BELOW(11, 8) +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__half __x) noexcept +{ + return !::__hisnan(__x) && !::__hisinf(__x); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isfinite(__nv_bfloat16 __x) noexcept +{ + return !::__hisnan(__x) && !::__hisinf(__x); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isinf + +#if defined(_CCCL_BUILTIN_ISINF) +# define _CCCL_CONSTEXPR_ISINF constexpr +#else // ^^^ _CCCL_BUILTIN_ISINF ^^^ / vvv !_CCCL_BUILTIN_ISINF vvv +# define _CCCL_CONSTEXPR_ISINF +#endif // !_CCCL_BUILTIN_ISINF + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool isinf(_A1) noexcept +{ + return false; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISINF bool isinf(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISINF) + return _CCCL_BUILTIN_ISINF(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isinf(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isinf(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISINF bool isinf(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISINF) + return _CCCL_BUILTIN_ISINF(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isinf(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isinf(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISINF bool isinf(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ISINF) + return _CCCL_BUILTIN_ISINF(__x); +# elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isinf(__x); +# else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isinf(__x); +# endif // !_CCCL_CUDACC_BELOW(11, 8) +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isinf(__half __x) noexcept +{ +# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) + // this is a workaround for nvbug 4362808 + return !::__hisnan(__x) && ::__hisnan(__x - __x); +# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv + return ::__hisinf(__x) != 0; +# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_VER < 1203000 +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isinf(__nv_bfloat16 __x) noexcept +{ +# if _CCCL_STD_VER >= 2020 && _CCCL_CUDACC_BELOW(12, 3) + // this is a workaround for nvbug 4362808 + return !::__hisnan(__x) && ::__hisnan(__x - __x); +# else // ^^^ C++20 && below 12.3 ^^^ / vvv C++17 or 12.3+ vvv + return ::__hisinf(__x) != 0; +# endif // _CCCL_STD_VER <= 2017 || _CCCL_CUDACC_VER < 1203000 +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isnan + +#if defined(_CCCL_BUILTIN_ISNAN) +# define _CCCL_CONSTEXPR_ISNAN constexpr +#else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv +# define _CCCL_CONSTEXPR_ISNAN +#endif // !_CCCL_BUILTIN_ISNAN + +template = 0> 
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool isnan(_A1) noexcept +{ + return false; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISNAN bool isnan(float __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISNAN) + return _CCCL_BUILTIN_ISNAN(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isnan(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISNAN bool isnan(double __x) noexcept +{ +#if defined(_CCCL_BUILTIN_ISNAN) + return _CCCL_BUILTIN_ISNAN(__x); +#elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isnan(__x); +#else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isnan(__x); +#endif // !_CCCL_CUDACC_BELOW(11, 8) +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_ISNAN bool isnan(long double __x) noexcept +{ +# if defined(_CCCL_BUILTIN_ISNAN) + return _CCCL_BUILTIN_ISNAN(__x); +# elif _CCCL_CUDACC_BELOW(11, 8) + return ::__isnan(__x); +# else // ^^^ _CCCL_CUDACC_BELOW(11, 8) ^^^ / vvv !_CCCL_CUDACC_BELOW(11, 8) vvv + return ::isnan(__x); +# endif // !_CCCL_CUDACC_BELOW(11, 8) +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnan(__half __x) noexcept +{ + return ::__hisnan(__x); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnan(__nv_bfloat16 __x) noexcept +{ + return ::__hisnan(__x); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isnormal + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(_A1 __x) noexcept +{ + return __x != 0; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(float __x) noexcept +{ + return _CUDA_VSTD::fpclassify(__x) == FP_NORMAL; +} + +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(double __x) noexcept +{ + return _CUDA_VSTD::fpclassify(__x) == FP_NORMAL; +} + +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(long double __x) noexcept +{ + return _CUDA_VSTD::fpclassify(__x) == FP_NORMAL; +} +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#if defined(_LIBCUDACXX_HAS_NVFP16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(__half __x) noexcept +{ + return _CUDA_VSTD::isnormal(__half2float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVFP16 + +#if defined(_LIBCUDACXX_HAS_NVBF16) +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isnormal(__nv_bfloat16 __x) noexcept +{ + return _CUDA_VSTD::isnormal(__bfloat162float(__x)); +} +#endif // _LIBCUDACXX_HAS_NVBF16 + +// isgreater + +template +struct __is_extended_arithmetic +{ + static constexpr bool value = _CCCL_TRAIT(is_arithmetic, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp); +}; + +#if !defined(_CCCL_NO_VARIABLE_TEMPLATES) +template +_CCCL_INLINE_VAR constexpr bool __is_extended_arithmetic_v = + is_arithmetic_v<_Tp> || __is_extended_floating_point_v<_Tp>; +#endif // !_CCCL_NO_INLINE_VARIABLES + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_isgreater(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x > __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isgreater(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + 
(return ::isgreater((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_isgreater((type) __x, (type) __y);)) +} + +// isgreaterequal + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_isgreaterequal(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x >= __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isgreaterequal(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::isgreaterequal((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_isgreaterequal((type) __x, (type) __y);)) +} + +// isless + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_isless(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x < __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isless(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::isless((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_isless((type) __x, (type) __y);)) +} + +// islessequal + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_islessequal(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x <= __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool islessequal(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::islessequal((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_islessequal((type) __x, (type) __y);)) +} + +// islessgreater + +template = 0> +_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI bool __device_islessgreater(_A1 __x, _A1 __y) noexcept +{ + if (_CUDA_VSTD::isnan(__x) || _CUDA_VSTD::isnan(__y)) + { + return false; + } + return __x < __y || __x > __y; +} + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool islessgreater(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + NV_IF_ELSE_TARGET(NV_IS_HOST, + (return ::islessgreater((type) __x, (type) __y);), + (return _CUDA_VSTD::__device_islessgreater((type) __x, (type) __y);)) +} + +// isunordered + +template = 0> +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI bool isunordered(_A1 __x, _A2 __y) noexcept +{ + using type = __promote_t<_A1, _A2>; + return _CUDA_VSTD::isnan((type) __x) || _CUDA_VSTD::isnan((type) __y); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___CMATH_TRAITS_H diff --git a/libcudacxx/include/cuda/std/__complex/nvbf16.h b/libcudacxx/include/cuda/std/__complex/nvbf16.h index 0167f952141..1282b47f6d9 100644 --- a/libcudacxx/include/cuda/std/__complex/nvbf16.h +++ b/libcudacxx/include/cuda/std/__complex/nvbf16.h @@ -28,8 +28,8 @@ _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") # include _CCCL_DIAG_POP +# include # include -# include # include # include # include diff --git a/libcudacxx/include/cuda/std/__complex/nvfp16.h b/libcudacxx/include/cuda/std/__complex/nvfp16.h index 8ddd2b27747..bc2da05d61d 100644 --- a/libcudacxx/include/cuda/std/__complex/nvfp16.h +++ b/libcudacxx/include/cuda/std/__complex/nvfp16.h @@ -25,8 +25,8 @@ # include +# include # include -# include # include # include # include diff --git a/libcudacxx/include/cuda/std/__type_traits/promote.h b/libcudacxx/include/cuda/std/__type_traits/promote.h index daa545c5fa1..18a5afacfef 100644 --- 
a/libcudacxx/include/cuda/std/__type_traits/promote.h +++ b/libcudacxx/include/cuda/std/__type_traits/promote.h @@ -43,10 +43,10 @@ struct __numeric_type { _LIBCUDACXX_HIDE_FROM_ABI static void __test(...); #ifdef _LIBCUDACXX_HAS_NVFP16 - _LIBCUDACXX_HIDE_FROM_ABI static __half __test(__half); + _LIBCUDACXX_HIDE_FROM_ABI static float __test(__half); #endif // _LIBCUDACXX_HAS_NVBF16 #ifdef _LIBCUDACXX_HAS_NVBF16 - _LIBCUDACXX_HIDE_FROM_ABI static __nv_bfloat16 __test(__nv_bfloat16); + _LIBCUDACXX_HIDE_FROM_ABI static float __test(__nv_bfloat16); #endif // _LIBCUDACXX_HAS_NVFP16 _LIBCUDACXX_HIDE_FROM_ABI static float __test(float); _LIBCUDACXX_HIDE_FROM_ABI static double __test(char); @@ -69,10 +69,55 @@ struct __numeric_type static const bool value = true; }; +template +struct __is_mixed_extended_floating_point +{ + static constexpr bool value = false; +}; + +#if defined(_LIBCUDACXX_HAS_NVFP16) && defined(_LIBCUDACXX_HAS_NVBF16) +template +struct __is_mixed_extended_floating_point<_A1, __half, __nv_bfloat16> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<_A1, __nv_bfloat16, __half> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__half, _A1, __nv_bfloat16> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__nv_bfloat16, _A1, __half> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__half, __nv_bfloat16, _A1> +{ + static constexpr bool value = true; +}; + +template +struct __is_mixed_extended_floating_point<__nv_bfloat16, __half, _A1> +{ + static constexpr bool value = true; +}; +#endif // _LIBCUDACXX_HAS_NVFP16 && _LIBCUDACXX_HAS_NVBF16 + template ::value && __numeric_type<_A2>::value && __numeric_type<_A3>::value> + bool = __numeric_type<_A1>::value && __numeric_type<_A2>::value && __numeric_type<_A3>::value + && !__is_mixed_extended_floating_point<_A1, _A2, _A3>::value> class __promote_imp { public: @@ -96,8 +141,8 @@ template class __promote_imp<_A1, _A2, void, true> { private: - typedef typename __promote_imp<_A1>::type __type1; - typedef typename __promote_imp<_A2>::type __type2; + using __type1 = typename __promote_imp<_A1>::type; + using __type2 = typename __promote_imp<_A2>::type; public: typedef decltype(__type1() + __type2()) type; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath index 7066ddec4f2..0f5610d97fc 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/cmath +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/cmath @@ -318,16 +318,21 @@ long double truncl(long double x); # include #endif // _CCCL_COMPILER(NVHPC) +#include +#include +#include +#include +#include #include #include #include #ifdef _LIBCUDACXX_HAS_NVFP16 -# include +# include #endif // _LIBCUDACXX_HAS_NVFP16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 #if _CCCL_COMPILER(NVRTC) @@ -340,11 +345,6 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -using ::isfinite; -using ::isinf; -using ::isnan; -using ::signbit; - using ::acos; using ::acosf; using ::asin; @@ -386,9 +386,6 @@ using ::asinhf; using ::atanh; using ::atanhf; -using ::log; -using ::logf; - using ::hypot; using ::hypotf; @@ -398,16 +395,6 @@ using ::abs; #if !_CCCL_COMPILER(NVRTC) -using ::fpclassify; -using ::isgreater; -using ::isgreaterequal; -using ::isless; -using ::islessequal; -using ::islessgreater; 
-using ::isnormal; - -using ::isunordered; - using ::double_t; using ::float_t; @@ -424,11 +411,6 @@ using ::frexpf; using ::ldexp; using ::ldexpf; -using ::log; -using ::logf; - -using ::log10; -using ::log10f; using ::modf; using ::modff; @@ -472,24 +454,12 @@ using ::fdim; using ::fdimf; using ::fma; using ::fmaf; -using ::fmax; -using ::fmaxf; -using ::fmin; -using ::fminf; -using ::ilogb; -using ::ilogbf; using ::lgamma; using ::lgammaf; using ::llrint; using ::llrintf; using ::llround; using ::llroundf; -using ::log1p; -using ::log1pf; -using ::log2; -using ::log2f; -using ::logb; -using ::logbf; using ::lrint; using ::lrintf; using ::lround; @@ -534,8 +504,6 @@ using ::floorl; using ::fmodl; using ::frexpl; using ::ldexpl; -using ::log10l; -using ::logl; using ::modfl; using ::powl; using ::sinhl; @@ -557,16 +525,10 @@ using ::exp2l; using ::expm1l; using ::fdiml; using ::fmal; -using ::fmaxl; -using ::fminl; using ::hypotl; -using ::ilogbl; using ::lgammal; using ::llrintl; using ::llroundl; -using ::log1pl; -using ::log2l; -using ::logbl; using ::lrintl; using ::lroundl; using ::nanl; @@ -587,16 +549,16 @@ using ::truncl; #if _CCCL_STD_VER > 2014 && !defined(__cuda_std__) _LIBCUDACXX_HIDE_FROM_ABI float hypot(float x, float y, float z) { - return sqrt(x * x + y * y + z * z); + return _CUDA_VSTD::sqrt(x * x + y * y + z * z); } _LIBCUDACXX_HIDE_FROM_ABI double hypot(double x, double y, double z) { - return sqrt(x * x + y * y + z * z); + return _CUDA_VSTD::sqrt(x * x + y * y + z * z); } # ifdef _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE _LIBCUDACXX_HIDE_FROM_ABI long double hypot(long double x, long double y, long double z) { - return sqrt(x * x + y * y + z * z); + return _CUDA_VSTD::sqrt(x * x + y * y + z * z); } # endif @@ -610,7 +572,7 @@ hypot(_A1 __lcpp_x, _A2 __lcpp_y, _A3 __lcpp_z) noexcept static_assert( (!(is_same<_A1, __result_type>::value && is_same<_A2, __result_type>::value && is_same<_A3, __result_type>::value)), ""); - return ::hypot((__result_type) __lcpp_x, (__result_type) __lcpp_y, (__result_type) __lcpp_z); + return _CUDA_VSTD::hypot((__result_type) __lcpp_x, (__result_type) __lcpp_y, (__result_type) __lcpp_z); } #endif @@ -620,69 +582,6 @@ hypot(_A1 __lcpp_x, _A2 __lcpp_y, _A3 __lcpp_z) noexcept # define _CCCL_CONSTEXPR_CXX14_COMPLEX #endif // _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isnan(_A1 __lcpp_x) noexcept -{ -#if _CCCL_CUDACC_BELOW(11, 8) - return __isnan(__lcpp_x); -#elif defined(_CCCL_BUILTIN_ISNAN) - // nvcc at times has issues determining the type of __lcpp_x - return _CCCL_BUILTIN_ISNAN(static_cast(__lcpp_x)); -#else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv - return ::isnan(__lcpp_x); -#endif // !_CCCL_BUILTIN_ISNAN -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isnan(_A1 __lcpp_x) noexcept -{ - return ::isnan(__lcpp_x); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isinf(_A1 __lcpp_x) noexcept -{ -#if _CCCL_CUDACC_BELOW(11, 8) - return __isinf(__lcpp_x); -#elif defined(_CCCL_BUILTIN_ISINF) - // nvcc at times has issues determining the type of __lcpp_x - return __builtin_isinf(static_cast(__lcpp_x)); -#else // ^^^ _CCCL_BUILTIN_ISINF ^^^ / vvv !_CCCL_BUILTIN_ISINF vvv - return ::isinf(__lcpp_x); -#endif // !_CCCL_BUILTIN_ISINF -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isinf(_A1 __lcpp_x) noexcept -{ - return 
::isinf(__lcpp_x); -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isfinite(_A1 __lcpp_x) noexcept -{ -#if _CCCL_CUDACC_BELOW(11, 8) - return !__isinf(__lcpp_x) && !__isnan(__lcpp_x); -#elif defined(_CCCL_BUILTIN_ISFINITE) - // nvcc at times has issues determining the type of __lcpp_x - return __builtin_isfinite(static_cast(__lcpp_x)); -#else // ^^^ _CCCL_BUILTIN_ISFINITE ^^^ / vvv !_CCCL_BUILTIN_ISFINITE vvv - return ::isfinite(__lcpp_x); -#endif // !_CCCL_BUILTIN_ISFINITE -} - -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr enable_if_t::value, bool> -__constexpr_isfinite(_A1 __lcpp_x) noexcept -{ - return isfinite(__lcpp_x); -} - #if _CCCL_COMPILER(MSVC) || _CCCL_COMPILER(NVRTC) template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_copysign(_A1 __x, _A1 __y) noexcept @@ -700,11 +599,13 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_copysign(doub return __builtin_copysign(__x, __y); } +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 long double __constexpr_copysign(long double __x, long double __y) noexcept { return __builtin_copysignl(__x, __y); } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE template _LIBCUDACXX_HIDE_FROM_ABI @@ -734,10 +635,12 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_fabs(double _ return __builtin_fabs(__x); } +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 long double __constexpr_fabs(long double __x) noexcept { return __builtin_fabsl(__x); } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE template ::value, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 double __constexpr_fabs(_Tp __x) noexcept @@ -762,11 +665,11 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX float __constexpr_fmax(f if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) # endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) { - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return __y; } - if (__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { return __x; } @@ -785,11 +688,11 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX double __constexpr_fmax( if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) # endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) { - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return __y; } - if (__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { return __x; } @@ -799,29 +702,31 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX double __constexpr_fmax( return __builtin_fmax(__x, __y); } +# if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX long double __constexpr_fmax(long double __x, long double __y) noexcept { -# if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && !defined(_LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS) -# if _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) +# if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && !defined(_LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS) +# if _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) if (false) -# else // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) +# else // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED()) -# endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) +# endif // _CCCL_COMPILER(ICC) && _NV_ISEMPTY(_CCCL_CONSTEXPR_CXX14_COMPLEX) { - if 
(__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return __y; } - if (__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { return __x; } return __x < __y ? __y : __x; } -# endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) +# endif // defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) return __builtin_fmax(__x, __y); } +# endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE template ::value && is_arithmetic<_Up>::value, int> = 0> _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX __promote_t<_Tp, _Up> __constexpr_fmax(_Tp __x, _Up __y) noexcept @@ -835,7 +740,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX __promote_t<_Tp, _Up> __ template _LIBCUDACXX_HIDE_FROM_ABI _A1 __constexpr_logb(_A1 __x) { - return ::logb(__x); + return _CUDA_VSTD::logb(__x); } #else template @@ -850,17 +755,17 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_logb(_Tp return -numeric_limits<_Tp>::infinity(); } - if (__constexpr_isinf(__x)) + if (_CUDA_VSTD::isinf(__x)) { return numeric_limits<_Tp>::infinity(); } - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return numeric_limits<_Tp>::quiet_NaN(); } - __x = __constexpr_fabs(__x); + __x = _CUDA_VSTD::__constexpr_fabs(__x); unsigned long long __exp = 0; while (__x >= _Tp(numeric_limits<_Tp>::radix)) { @@ -912,7 +817,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ return __x; } - if (__constexpr_isinf(__x)) + if (_CUDA_VSTD::isinf(__x)) { return __x; } @@ -922,7 +827,7 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ return __x; } - if (__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { return numeric_limits<_Tp>::quiet_NaN(); } @@ -960,86 +865,6 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14_COMPLEX _Tp __constexpr_scalbn(_ } #endif // !_CCCL_COMPILER(MSVC) -#if _CCCL_STD_VER > 2017 -template -_LIBCUDACXX_HIDE_FROM_ABI constexpr _Fp __lerp(_Fp __a, _Fp __b, _Fp __t) noexcept -{ - if ((__a <= 0 && __b >= 0) || (__a >= 0 && __b <= 0)) - { - return __t * __b + (1 - __t) * __a; - } - - if (__t == 1) - { - return __b; - } - const _Fp __x = __a + __t * (__b - __a); - if ((__t > 1) == (__b > __a)) - { - return __b < __x ? __x : __b; - } - else - { - return __x < __b ? 
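// Annotation, not part of the change: the __lerp block being removed here guards
// monotonicity under extrapolation. With x = a + t*(b - a), the test
// (t > 1) == (b > a) is true exactly when the mathematical result is >= b, so the
// code returns max(x, b) in that case and min(x, b) otherwise; rounding can
// therefore never push the result to the wrong side of b. cuda::std::lerp itself
// is still exercised by the new lerp.pass.cpp test added later in this patch.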
__x : __b; - } -} - -_LIBCUDACXX_HIDE_FROM_ABI constexpr float lerp(float __a, float __b, float __t) noexcept -{ - return __lerp(__a, __b, __t); -} - -_LIBCUDACXX_HIDE_FROM_ABI constexpr double lerp(double __a, double __b, double __t) noexcept -{ - return __lerp(__a, __b, __t); -} - -_LIBCUDACXX_HIDE_FROM_ABI constexpr long double lerp(long double __a, long double __b, long double __t) noexcept -{ - return __lerp(__a, __b, __t); -} - -#endif // _CCCL_STD_VER > 2017 - -template ::digits > numeric_limits<_IntT>::digits), - int _Bits = (numeric_limits<_IntT>::digits - numeric_limits<_FloatT>::digits)> -_LIBCUDACXX_HIDE_FROM_ABI constexpr _IntT __max_representable_int_for_float() noexcept -{ - static_assert(is_floating_point<_FloatT>::value, "must be a floating point type"); - static_assert(is_integral<_IntT>::value, "must be an integral type"); - static_assert(numeric_limits<_FloatT>::radix == 2, "FloatT has incorrect radix"); -#ifdef _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE - static_assert( - (_IsSame<_FloatT, float>::value || _IsSame<_FloatT, double>::value || _IsSame<_FloatT, long double>::value), - "unsupported floating point type"); -#else - static_assert((_IsSame<_FloatT, float>::value || _IsSame<_FloatT, double>::value), "unsupported floating point type"); -#endif - return _FloatBigger ? numeric_limits<_IntT>::max() : (numeric_limits<_IntT>::max() >> _Bits << _Bits); -} - -// Convert a floating point number to the specified integral type after -// clamping to the integral types representable range. -// -// The behavior is undefined if `__r` is NaN. -template -_LIBCUDACXX_HIDE_FROM_ABI _IntT __clamp_to_integral(_RealT __r) noexcept -{ - using _Lim = _CUDA_VSTD::numeric_limits<_IntT>; - const _IntT _MaxVal = _CUDA_VSTD::__max_representable_int_for_float<_IntT, _RealT>(); - if (__r >= ::nextafter(static_cast<_RealT>(_MaxVal), INFINITY)) - { - return _Lim::max(); - } - else if (__r <= _Lim::lowest()) - { - return _Lim::min(); - } - return static_cast<_IntT>(__r); -} - _LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/complex b/libcudacxx/include/cuda/std/detail/libcxx/include/complex index 7eecbcc4a20..22a88aa93db 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/complex +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/complex @@ -513,16 +513,14 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); - bool __z_inf = _CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b); - bool __w_inf = _CUDA_VSTD::__constexpr_isinf(__c) || _CUDA_VSTD::__constexpr_isinf(__d); + bool __z_inf = _CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b); + bool __w_inf = _CUDA_VSTD::isinf(__c) || _CUDA_VSTD::isinf(__d); bool __z_nan = !__z_inf - && ((_CUDA_VSTD::__constexpr_isnan(__a) && _CUDA_VSTD::__constexpr_isnan(__b)) - || (_CUDA_VSTD::__constexpr_isnan(__a) && __b == _Tp(0)) - || (__a == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__b))); + && ((_CUDA_VSTD::isnan(__a) && _CUDA_VSTD::isnan(__b)) || (_CUDA_VSTD::isnan(__a) && __b == _Tp(0)) + || (__a == _Tp(0) && _CUDA_VSTD::isnan(__b))); bool __w_nan = !__w_inf - && ((_CUDA_VSTD::__constexpr_isnan(__c) && _CUDA_VSTD::__constexpr_isnan(__d)) - || (_CUDA_VSTD::__constexpr_isnan(__c) && __d == _Tp(0)) - || (__c == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__d))); + && ((_CUDA_VSTD::isnan(__c) && _CUDA_VSTD::isnan(__d)) || (_CUDA_VSTD::isnan(__c) && __d == _Tp(0)) + || (__c == 
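// Annotation, not part of the change: an operand of the complex product is
// treated as NaN-like only when it is not infinite and either both parts are NaN
// or one part is NaN while the other part is exactly zero. Restated as a
// standalone predicate (name is hypothetical, assumes <cuda/std/cmath>):
bool nan_like_sketch(double re, double im)
{
  const bool is_inf = cuda::std::isinf(re) || cuda::std::isinf(im);
  return !is_inf
      && ((cuda::std::isnan(re) && cuda::std::isnan(im))
          || (cuda::std::isnan(re) && im == 0.0)
          || (re == 0.0 && cuda::std::isnan(im)));
}
// Such operands force the whole product to NaN; fully infinite operands are
// handled separately a few lines further down.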
_Tp(0) && _CUDA_VSTD::isnan(__d))); if (__z_nan || __w_nan) { return complex<_Tp>(_Tp(numeric_limits<_Tp>::quiet_NaN()), _Tp(0)); @@ -535,10 +533,8 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) } return complex<_Tp>(_Tp(numeric_limits<_Tp>::infinity()), _Tp(numeric_limits<_Tp>::infinity())); } - bool __z_nonzero_nan = - !__z_inf && !__z_nan && (_CUDA_VSTD::__constexpr_isnan(__a) || _CUDA_VSTD::__constexpr_isnan(__b)); - bool __w_nonzero_nan = - !__w_inf && !__w_nan && (_CUDA_VSTD::__constexpr_isnan(__c) || _CUDA_VSTD::__constexpr_isnan(__d)); + bool __z_nonzero_nan = !__z_inf && !__z_nan && (_CUDA_VSTD::isnan(__a) || _CUDA_VSTD::isnan(__b)); + bool __w_nonzero_nan = !__w_inf && !__w_nan && (_CUDA_VSTD::isnan(__c) || _CUDA_VSTD::isnan(__d)); if (__z_nonzero_nan || __w_nonzero_nan) { return complex<_Tp>(_Tp(numeric_limits<_Tp>::quiet_NaN()), _Tp(0)); @@ -551,54 +547,54 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) _Tp __x = __partials.__ac - __partials.__bd; _Tp __y = __partials.__ad + __partials.__bc; #ifndef LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_MULTIPLICATION - if (_CUDA_VSTD::__constexpr_isnan(__x) && _CUDA_VSTD::__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__x) && _CUDA_VSTD::isnan(__y)) { bool __recalc = false; - if (_CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b)) + if (_CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b)) { - __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__a) ? _Tp(1) : _Tp(0), __a); - __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__b) ? _Tp(1) : _Tp(0), __b); - if (_CUDA_VSTD::__constexpr_isnan(__c)) + __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__b) ? _Tp(1) : _Tp(0), __b); + if (_CUDA_VSTD::isnan(__c)) { __c = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __c); } - if (_CUDA_VSTD::__constexpr_isnan(__d)) + if (_CUDA_VSTD::isnan(__d)) { __d = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __d); } __recalc = true; } - if (_CUDA_VSTD::__constexpr_isinf(__c) || _CUDA_VSTD::__constexpr_isinf(__d)) + if (_CUDA_VSTD::isinf(__c) || _CUDA_VSTD::isinf(__d)) { - __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__c) ? _Tp(1) : _Tp(0), __c); - __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__d) ? _Tp(1) : _Tp(0), __d); - if (_CUDA_VSTD::__constexpr_isnan(__a)) + __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__d) ? 
_Tp(1) : _Tp(0), __d); + if (_CUDA_VSTD::isnan(__a)) { __a = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __a); } - if (_CUDA_VSTD::__constexpr_isnan(__b)) + if (_CUDA_VSTD::isnan(__b)) { __b = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __b); } __recalc = true; } if (!__recalc - && (_CUDA_VSTD::__constexpr_isinf(__partials.__ac) || _CUDA_VSTD::__constexpr_isinf(__partials.__bd) - || _CUDA_VSTD::__constexpr_isinf(__partials.__ad) || _CUDA_VSTD::__constexpr_isinf(__partials.__bc))) + && (_CUDA_VSTD::isinf(__partials.__ac) || _CUDA_VSTD::isinf(__partials.__bd) + || _CUDA_VSTD::isinf(__partials.__ad) || _CUDA_VSTD::isinf(__partials.__bc))) { - if (_CUDA_VSTD::__constexpr_isnan(__a)) + if (_CUDA_VSTD::isnan(__a)) { __a = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __a); } - if (_CUDA_VSTD::__constexpr_isnan(__b)) + if (_CUDA_VSTD::isnan(__b)) { __b = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __b); } - if (_CUDA_VSTD::__constexpr_isnan(__c)) + if (_CUDA_VSTD::isnan(__c)) { __c = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __c); } - if (_CUDA_VSTD::__constexpr_isnan(__d)) + if (_CUDA_VSTD::isnan(__d)) { __d = _CUDA_VSTD::__constexpr_copysign(_Tp(0), __d); } @@ -643,7 +639,7 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) _Tp __d = __w.imag(); _Tp __logbw = _CUDA_VSTD::__constexpr_logb( _CUDA_VSTD::__constexpr_fmax(_CUDA_VSTD::__constexpr_fabs(__c), _CUDA_VSTD::__constexpr_fabs(__d))); - if (_CUDA_VSTD::__constexpr_isfinite(__logbw)) + if (_CUDA_VSTD::isfinite(__logbw)) { __ilogbw = static_cast(__logbw); __c = _CUDA_VSTD::__constexpr_scalbn(__c, -__ilogbw); @@ -656,24 +652,20 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) { bool __z_zero = __a == _Tp(0) && __b == _Tp(0); bool __w_zero = __c == _Tp(0) && __d == _Tp(0); - bool __z_inf = _CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b); - bool __w_inf = _CUDA_VSTD::__constexpr_isinf(__c) || _CUDA_VSTD::__constexpr_isinf(__d); + bool __z_inf = _CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b); + bool __w_inf = _CUDA_VSTD::isinf(__c) || _CUDA_VSTD::isinf(__d); bool __z_nan = !__z_inf - && ((_CUDA_VSTD::__constexpr_isnan(__a) && _CUDA_VSTD::__constexpr_isnan(__b)) - || (_CUDA_VSTD::__constexpr_isnan(__a) && __b == _Tp(0)) - || (__a == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__b))); + && ((_CUDA_VSTD::isnan(__a) && _CUDA_VSTD::isnan(__b)) || (_CUDA_VSTD::isnan(__a) && __b == _Tp(0)) + || (__a == _Tp(0) && _CUDA_VSTD::isnan(__b))); bool __w_nan = !__w_inf - && ((_CUDA_VSTD::__constexpr_isnan(__c) && _CUDA_VSTD::__constexpr_isnan(__d)) - || (_CUDA_VSTD::__constexpr_isnan(__c) && __d == _Tp(0)) - || (__c == _Tp(0) && _CUDA_VSTD::__constexpr_isnan(__d))); + && ((_CUDA_VSTD::isnan(__c) && _CUDA_VSTD::isnan(__d)) || (_CUDA_VSTD::isnan(__c) && __d == _Tp(0)) + || (__c == _Tp(0) && _CUDA_VSTD::isnan(__d))); if ((__z_nan || __w_nan) || (__z_inf && __w_inf)) { return complex<_Tp>(_Tp(numeric_limits<_Tp>::quiet_NaN()), _Tp(0)); } - bool __z_nonzero_nan = - !__z_inf && !__z_nan && (_CUDA_VSTD::__constexpr_isnan(__a) || _CUDA_VSTD::__constexpr_isnan(__b)); - bool __w_nonzero_nan = - !__w_inf && !__w_nan && (_CUDA_VSTD::__constexpr_isnan(__c) || _CUDA_VSTD::__constexpr_isnan(__d)); + bool __z_nonzero_nan = !__z_inf && !__z_nan && (_CUDA_VSTD::isnan(__a) || _CUDA_VSTD::isnan(__b)); + bool __w_nonzero_nan = !__w_inf && !__w_nan && (_CUDA_VSTD::isnan(__c) || _CUDA_VSTD::isnan(__d)); if (__z_nonzero_nan || __w_nonzero_nan) { if (__w_zero) @@ -708,26 +700,25 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) _Tp __x = 
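// Annotation, not part of the change: __c and __d were pre-scaled above by
// 2^(-__ilogbw), with __ilogbw = logb(max(|c|, |d|)), so that the denominator
// __denom = c*c + d*d can neither overflow nor underflow. Scaling only the divisor
// multiplies the raw quotient by 2^(+__ilogbw); the scalbn(..., -__ilogbw) applied
// to __x and __y just below removes exactly that factor again, e.g.
// scalbn(3.0, 4) == 48.0 (3 * 2^4) and scalbn(48.0, -4) == 3.0.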
_CUDA_VSTD::__constexpr_scalbn((__partials.__ac + __partials.__bd) / __denom, -__ilogbw); _Tp __y = _CUDA_VSTD::__constexpr_scalbn((__partials.__bc - __partials.__ad) / __denom, -__ilogbw); #ifndef LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_DIVISION - if (_CUDA_VSTD::__constexpr_isnan(__x) && _CUDA_VSTD::__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__x) && _CUDA_VSTD::isnan(__y)) { - if ((__denom == _Tp(0)) && (!_CUDA_VSTD::__constexpr_isnan(__a) || !_CUDA_VSTD::__constexpr_isnan(__b))) + if ((__denom == _Tp(0)) && (!_CUDA_VSTD::isnan(__a) || !_CUDA_VSTD::isnan(__b))) { __x = _CUDA_VSTD::__constexpr_copysign(_Tp(INFINITY), __c) * __a; __y = _CUDA_VSTD::__constexpr_copysign(_Tp(INFINITY), __c) * __b; } - else if ((_CUDA_VSTD::__constexpr_isinf(__a) || _CUDA_VSTD::__constexpr_isinf(__b)) - && _CUDA_VSTD::__constexpr_isfinite(__c) && _CUDA_VSTD::__constexpr_isfinite(__d)) + else if ((_CUDA_VSTD::isinf(__a) || _CUDA_VSTD::isinf(__b)) && _CUDA_VSTD::isfinite(__c) + && _CUDA_VSTD::isfinite(__d)) { - __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__a) ? _Tp(1) : _Tp(0), __a); - __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__b) ? _Tp(1) : _Tp(0), __b); + __a = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__b) ? _Tp(1) : _Tp(0), __b); __x = _Tp(INFINITY) * (__a * __c + __b * __d); __y = _Tp(INFINITY) * (__b * __c - __a * __d); } - else if (_CUDA_VSTD::__constexpr_isinf(__logbw) && __logbw > _Tp(0) && _CUDA_VSTD::__constexpr_isfinite(__a) - && _CUDA_VSTD::__constexpr_isfinite(__b)) + else if (_CUDA_VSTD::isinf(__logbw) && __logbw > _Tp(0) && _CUDA_VSTD::isfinite(__a) && _CUDA_VSTD::isfinite(__b)) { - __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__c) ? _Tp(1) : _Tp(0), __c); - __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::__constexpr_isinf(__d) ? _Tp(1) : _Tp(0), __d); + __c = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = _CUDA_VSTD::__constexpr_copysign(_CUDA_VSTD::isinf(__d) ? 
_Tp(1) : _Tp(0), __d); __x = _Tp(0) * (__a * __c + __b * __d); __y = _Tp(0) * (__b * __c - __a * __d); } @@ -928,11 +919,11 @@ _LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, float> arg(_Tp template _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp norm(const complex<_Tp>& __c) { - if (_CUDA_VSTD::__constexpr_isinf(__c.real())) + if (_CUDA_VSTD::isinf(__c.real())) { return _CUDA_VSTD::abs(__c.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__c.imag())) + if (_CUDA_VSTD::isinf(__c.imag())) { return _CUDA_VSTD::abs(__c.imag()); } @@ -965,7 +956,7 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> proj(const complex<_Tp>& __c) { complex<_Tp> __r = __c; - if (_CUDA_VSTD::__constexpr_isinf(__c.real()) || _CUDA_VSTD::__constexpr_isinf(__c.imag())) + if (_CUDA_VSTD::isinf(__c.real()) || _CUDA_VSTD::isinf(__c.imag())) { __r = complex<_Tp>(INFINITY, __constexpr_copysign(_Tp(0), __c.imag())); } @@ -975,7 +966,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> proj(const complex<_Tp>& __c) template _LIBCUDACXX_HIDE_FROM_ABI enable_if_t<__is_complex_float<_Tp>::value, __cccl_complex_complex_type<_Tp>> proj(_Tp __re) { - if (_CUDA_VSTD::__constexpr_isinf(__re)) + if (_CUDA_VSTD::isinf(__re)) { __re = _CUDA_VSTD::abs(__re); } @@ -993,33 +984,33 @@ _LIBCUDACXX_HIDE_FROM_ABI enable_if_t::value, __cccl_complex_co template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = _Tp()) { - if (_CUDA_VSTD::__constexpr_isnan(__rho) || _CUDA_VSTD::signbit(__rho)) + if (_CUDA_VSTD::isnan(__rho) || _CUDA_VSTD::signbit(__rho)) { return complex<_Tp>(_Tp(NAN), _Tp(NAN)); } - if (_CUDA_VSTD::__constexpr_isnan(__theta)) + if (_CUDA_VSTD::isnan(__theta)) { - if (_CUDA_VSTD::__constexpr_isinf(__rho)) + if (_CUDA_VSTD::isinf(__rho)) { return complex<_Tp>(__rho, __theta); } return complex<_Tp>(__theta, __theta); } - if (_CUDA_VSTD::__constexpr_isinf(__theta)) + if (_CUDA_VSTD::isinf(__theta)) { - if (_CUDA_VSTD::__constexpr_isinf(__rho)) + if (_CUDA_VSTD::isinf(__rho)) { return complex<_Tp>(__rho, _Tp(NAN)); } return complex<_Tp>(_Tp(NAN), _Tp(NAN)); } _Tp __x = __rho * _CUDA_VSTD::cos(__theta); - if (_CUDA_VSTD::__constexpr_isnan(__x)) + if (_CUDA_VSTD::isnan(__x)) { __x = 0; } _Tp __y = __rho * _CUDA_VSTD::sin(__theta); - if (_CUDA_VSTD::__constexpr_isnan(__y)) + if (_CUDA_VSTD::isnan(__y)) { __y = 0; } @@ -1047,18 +1038,18 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> log10(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> sqrt(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(_Tp(INFINITY), __x.imag()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { if (__x.real() > _Tp(0)) { - return complex<_Tp>( - __x.real(), _CUDA_VSTD::__constexpr_isnan(__x.imag()) ? __x.imag() : __constexpr_copysign(_Tp(0), __x.imag())); + return complex<_Tp>(__x.real(), + _CUDA_VSTD::isnan(__x.imag()) ? __x.imag() : __constexpr_copysign(_Tp(0), __x.imag())); } - return complex<_Tp>(_CUDA_VSTD::__constexpr_isnan(__x.imag()) ? __x.imag() : _Tp(0), + return complex<_Tp>(_CUDA_VSTD::isnan(__x.imag()) ? 
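// Annotation, not part of the change: these branches implement the usual complex
// sqrt special values. sqrt(x + i*(+/-inf)) is +inf + i*(+/-inf) for any x, even
// NaN; sqrt(+inf + i*y) keeps the real part +inf and takes copysign(0, y) (or a
// propagated NaN y) as the imaginary part; and this branch, reached for
// real() == -inf, returns a zero (or NaN) real part with an imaginary part of
// magnitude inf carrying the sign of y. Other inputs fall through to
// polar(sqrt(|z|), arg(z) / 2) below.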
__x.imag() : _Tp(0), __constexpr_copysign(__x.real(), __x.imag())); } return _CUDA_VSTD::polar(_CUDA_VSTD::sqrt(_CUDA_VSTD::abs(__x)), _CUDA_VSTD::arg(__x) / _Tp(2)); @@ -1074,18 +1065,18 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> exp(const complex<_Tp>& __x) { return complex<_Tp>(_CUDA_VSTD::exp(__x.real()), __constexpr_copysign(_Tp(0), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { if (__x.real() < _Tp(0)) { - if (!_CUDA_VSTD::__constexpr_isfinite(__i)) + if (!_CUDA_VSTD::isfinite(__i)) { __i = _Tp(1); } } - else if (__i == _Tp(0) || !_CUDA_VSTD::__constexpr_isfinite(__i)) + else if (__i == _Tp(0) || !_CUDA_VSTD::isfinite(__i)) { - if (_CUDA_VSTD::__constexpr_isinf(__i)) + if (_CUDA_VSTD::isinf(__i)) { __i = _Tp(NAN); } @@ -1144,21 +1135,21 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> asinh(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if (_CUDA_VSTD::isnan(__x.imag())) { return __x; } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__x.real(), __constexpr_copysign(__pi * _Tp(0.25), __x.imag())); } return complex<_Tp>(__x.real(), __constexpr_copysign(_Tp(0), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__x.imag(), __x.real()); } @@ -1168,7 +1159,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> asinh(const complex<_Tp>& __x) } return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__constexpr_copysign(__x.imag(), __x.real()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } @@ -1182,13 +1173,13 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if (_CUDA_VSTD::isnan(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.real()), __x.imag()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { if (__x.real() > _Tp(0)) { @@ -1205,15 +1196,15 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acosh(const complex<_Tp>& __x) } return complex<_Tp>(__x.real(), __constexpr_copysign(_Tp(0), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.imag()), __x.real()); } return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.imag()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } @@ -1227,23 +1218,23 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__constexpr_copysign(_Tp(0), __x.real()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if 
(_CUDA_VSTD::isnan(__x.imag())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real()) || __x.real() == _Tp(0)) + if (_CUDA_VSTD::isinf(__x.real()) || __x.real() == _Tp(0)) { return complex<_Tp>(__constexpr_copysign(_Tp(0), __x.real()), __x.imag()); } return complex<_Tp>(__x.imag(), __x.imag()); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { return complex<_Tp>(__constexpr_copysign(_Tp(0), __x.real()), __constexpr_copysign(__pi / _Tp(2), __x.imag())); } @@ -1260,15 +1251,15 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> atanh(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real()) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (_CUDA_VSTD::isinf(__x.real()) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(__x.real(), _Tp(NAN)); } - if (__x.real() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (__x.real() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(__x.real(), _Tp(NAN)); } - if (__x.imag() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.real())) + if (__x.imag() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.real())) { return __x; } @@ -1281,11 +1272,11 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> sinh(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real()) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (_CUDA_VSTD::isinf(__x.real()) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.real()), _Tp(NAN)); } - if (__x.real() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (__x.real() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(_Tp(NAN), __x.real()); } @@ -1293,7 +1284,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) { return complex<_Tp>(_Tp(1), __x.imag()); } - if (__x.imag() == _Tp(0) && !_CUDA_VSTD::__constexpr_isfinite(__x.real())) + if (__x.imag() == _Tp(0) && !_CUDA_VSTD::isfinite(__x.real())) { return complex<_Tp>(_CUDA_VSTD::abs(__x.real()), __x.imag()); } @@ -1306,16 +1297,16 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> cosh(const complex<_Tp>& __x) template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> tanh(const complex<_Tp>& __x) { - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (!_CUDA_VSTD::__constexpr_isfinite(__x.imag())) + if (!_CUDA_VSTD::isfinite(__x.imag())) { return complex<_Tp>(__constexpr_copysign(_Tp(1), __x.real()), _Tp(0)); } return complex<_Tp>(__constexpr_copysign(_Tp(1), __x.real()), __constexpr_copysign(_Tp(0), _CUDA_VSTD::sin(_Tp(2) * __x.imag()))); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real()) && __x.imag() == _Tp(0)) + if (_CUDA_VSTD::isnan(__x.real()) && __x.imag() == _Tp(0)) { return __x; } @@ -1323,7 +1314,7 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> tanh(const complex<_Tp>& __x) _Tp __2i(_Tp(2) * __x.imag()); _Tp __d(_CUDA_VSTD::cosh(__2r) + _CUDA_VSTD::cos(__2i)); _Tp __2rsh(_CUDA_VSTD::sinh(__2r)); - if (_CUDA_VSTD::__constexpr_isinf(__2rsh) && _CUDA_VSTD::__constexpr_isinf(__d)) + if (_CUDA_VSTD::isinf(__2rsh) && _CUDA_VSTD::isinf(__d)) { return complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), __2i > _Tp(0) ? 
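// Annotation, not part of the change: tanh(x + iy) is evaluated here as
// (sinh(2x) + i*sin(2y)) / (cosh(2x) + cos(2y)). For large |x| both sinh(2x) and
// the denominator overflow to infinity, and inf/inf would produce NaN; this branch
// instead returns the mathematical limit: a real part of +/-1 following the sign
// of sinh(2x), together with a signed zero imaginary part.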
_Tp(0) : _Tp(-0.)); } @@ -1345,13 +1336,13 @@ template _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) { const _Tp __pi(static_cast<_Tp>(atan2(+0., -0.))); - if (_CUDA_VSTD::__constexpr_isinf(__x.real())) + if (_CUDA_VSTD::isinf(__x.real())) { - if (_CUDA_VSTD::__constexpr_isnan(__x.imag())) + if (_CUDA_VSTD::isnan(__x.imag())) { return complex<_Tp>(__x.imag(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { if (__x.real() < _Tp(0)) { @@ -1365,15 +1356,15 @@ _LIBCUDACXX_HIDE_FROM_ABI complex<_Tp> acos(const complex<_Tp>& __x) } return complex<_Tp>(_Tp(0), _CUDA_VSTD::signbit(__x.imag()) ? __x.real() : -__x.real()); } - if (_CUDA_VSTD::__constexpr_isnan(__x.real())) + if (_CUDA_VSTD::isnan(__x.real())) { - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__x.real(), -__x.imag()); } return complex<_Tp>(__x.real(), __x.real()); } - if (_CUDA_VSTD::__constexpr_isinf(__x.imag())) + if (_CUDA_VSTD::isinf(__x.imag())) { return complex<_Tp>(__pi / _Tp(2), -__x.imag()); } diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp new file mode 100644 index 00000000000..a42b36caeaa --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_min_max.pass.cpp @@ -0,0 +1,118 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_fmax(T value) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + assert(cuda::std::fmax(value, (T) 0) == value); +} + +__host__ __device__ void test_fmax(float value) +{ + test_fmax(value); + test_fmax(value); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_fmax(value); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_fmax<__half>(__float2half(value)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_fmax<__nv_bfloat16>(__float2bfloat16(value)); +#endif // _LIBCUDACXX_HAS_NVBF16 + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +} + +template +__host__ __device__ void test_fmin(T value) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + static_assert( + (cuda::std::is_same>::value), ""); + assert(cuda::std::fmin(value, (T) 0) == T(0)); +} + +__host__ __device__ void test_fmin(float value) +{ + test_fmax(value); + test_fmax(value); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_fmax(value); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_fmax<__half>(__float2half(value)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_fmax<__nv_bfloat16>(__float2bfloat16(value)); +#endif // _LIBCUDACXX_HAS_NVBF16 + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +} + +__host__ __device__ void test(float value) +{ + test_fmax(value); + test_fmin(value); +} + +__global__ void test_global_kernel(float* value) +{ + test(*value); +} + +int main(int, char**) +{ + volatile float value = 1.0f; + test(value); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp new file mode 100644 index 00000000000..93e469bb3f4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/fp_traits.pass.cpp @@ -0,0 +1,458 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include +#include + +#include "fp_compare.h" +#include "test_macros.h" + +template +__host__ __device__ void test_fpclassify(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::fpclassify(T(val)) == FP_NORMAL); + assert(cuda::std::fpclassify(T(1.0)) == FP_NORMAL); + assert(cuda::std::fpclassify(T(0.0)) == FP_ZERO); + assert(cuda::std::fpclassify(T(-1.0)) == FP_NORMAL); + assert(cuda::std::fpclassify(T(-0.0)) == FP_ZERO); + // extended floating point types have issues here + if (!cuda::std::__is_extended_floating_point::value) + { + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::quiet_NaN())) == FP_NAN); + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::infinity())) == FP_INFINITE); + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::denorm_min())) == FP_SUBNORMAL); + } + else + { + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::quiet_NaN())) == FP_NAN); + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::infinity())) == FP_INFINITE); + // float subnormal turns to 0.0 for our half precision types + assert(cuda::std::fpclassify(T(cuda::std::numeric_limits::denorm_min())) == FP_ZERO); + } +} + +__host__ __device__ void test_fpclassify(float val) +{ + test_fpclassify(val); + test_fpclassify(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_fpclassify(val); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_fpclassify<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_fpclassify<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::fpclassify(0u) == FP_ZERO); + assert(cuda::std::fpclassify(cuda::std::numeric_limits::max()) == FP_NORMAL); + assert(cuda::std::fpclassify(1) == FP_NORMAL); + assert(cuda::std::fpclassify(-1) == FP_NORMAL); + assert(cuda::std::fpclassify(cuda::std::numeric_limits::max()) == FP_NORMAL); + assert(cuda::std::fpclassify(cuda::std::numeric_limits::min()) == FP_NORMAL); +} + +template +__host__ __device__ void test_signbit(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::signbit(T(val)) == false); + assert(cuda::std::signbit(T(-1.0)) == true); + assert(cuda::std::signbit(T(0.0)) == false); +} + +__host__ __device__ void test_signbit(float val) +{ + test_signbit(val); + test_signbit(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_signbit(val); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_signbit<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_signbit<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::signbit(0u) == false); + assert(cuda::std::signbit(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::signbit(1) == false); + assert(cuda::std::signbit(-1) == true); + assert(cuda::std::signbit(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::signbit(cuda::std::numeric_limits::min()) == true); +} + +template +__host__ __device__ void test_isfinite(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::isfinite(T(val)) == true); + assert(cuda::std::isfinite(T(-1.0f)) == true); + assert(cuda::std::isfinite(T(1.0f)) == true); + 
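// Annotation on the fpclassify checks earlier in this file, not part of the test:
// numeric_limits<float>::denorm_min() is 2^-149, far below the smallest subnormal
// representable in __half (2^-24) or __nv_bfloat16 (2^-133), so converting it to
// those types rounds to zero and fpclassify reports FP_ZERO rather than
// FP_SUBNORMAL, which is exactly what the extended-floating-point branch expects.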
assert(cuda::std::isfinite(T(NAN)) == false); + assert(cuda::std::isfinite(T(INFINITY)) == false); + assert(cuda::std::isfinite(-T(INFINITY)) == false); +} + +__host__ __device__ void test_isfinite(float val) +{ + test_isfinite(val); + test_isfinite(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_isfinite(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_isfinite<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_isfinite<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::isfinite(0) == true); + assert(cuda::std::isfinite(1) == true); + assert(cuda::std::isfinite(-1) == true); + assert(cuda::std::isfinite(cuda::std::numeric_limits::max()) == true); + assert(cuda::std::isfinite(cuda::std::numeric_limits::min()) == true); +} + +__host__ __device__ _CCCL_CONSTEXPR_ISFINITE bool test_constexpr_isfinite(float val) +{ + return cuda::std::isfinite(val); +} + +template +__host__ __device__ void test_isnormal(float val) +{ + static_assert((cuda::std::is_same::value), ""); + assert(cuda::std::isnormal(T(val)) == true); + assert(cuda::std::isnormal(T(-1.0f)) == true); + assert(cuda::std::isnormal(T(1.0f)) == true); + assert(cuda::std::isnormal(T(0.0f)) == false); + assert(cuda::std::isnormal(T(NAN)) == false); + assert(cuda::std::isnormal(T(INFINITY)) == false); + assert(cuda::std::isnormal(-T(INFINITY)) == false); +} + +__host__ __device__ void test_isnormal(float val) +{ + test_isnormal(val); + test_isnormal(val); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test_isnormal(); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test_isnormal<__half>(val); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test_isnormal<__nv_bfloat16>(val); +#endif // _LIBCUDACXX_HAS_NVBF16 + + assert(cuda::std::isnormal(0) == false); + assert(cuda::std::isnormal(1) == true); + assert(cuda::std::isnormal(-1) == true); + assert(cuda::std::isnormal(cuda::std::numeric_limits::max()) == true); + assert(cuda::std::isnormal(cuda::std::numeric_limits::min()) == true); +} + +__host__ __device__ void test_isgreater(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isgreater(-1.0, 0.F) == false); +} + +__host__ __device__ void test_isgreaterequal(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + 
static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); + static_assert( + (cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isgreaterequal(-1.0, 0.F) == false); +} + +__host__ __device__ void test_isinf(float val) +{ + static_assert((cuda::std::is_same::value), ""); + + typedef decltype(cuda::std::isinf((double) 0)) DoubleRetType; + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isinf(-1.0) == false); + assert(cuda::std::isinf(0) == false); + assert(cuda::std::isinf(1) == false); + assert(cuda::std::isinf(-1) == false); + assert(cuda::std::isinf(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::isinf(cuda::std::numeric_limits::min()) == false); +} + +__host__ __device__ _CCCL_CONSTEXPR_ISINF bool test_constexpr_isinf(float val) +{ + return cuda::std::isinf(val); +} + +__host__ __device__ void test_isless(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isless(-1.0, 0.F) == true); +} + +__host__ __device__ void test_islessequal(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + 
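// Annotation, not part of the test: isgreater/isgreaterequal/isless/islessequal/
// islessgreater/isunordered are the "quiet" comparison functions. Unlike the
// built-in operators they never raise FE_INVALID on a quiet-NaN operand; they
// simply return false (isunordered returns true). Hence the single value check per
// function here, while the bulk of each test pins down the bool return type for
// every promoted argument combination. For example:
//   cuda::std::isless(-1.0, 0.0f)      -> true
//   cuda::std::isless(NAN, 0.0f)       -> false
//   cuda::std::isunordered(NAN, 0.0f)  -> true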
static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::islessequal(-1.0, 0.F) == true); +} + +__host__ __device__ void test_islessgreater(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::islessgreater(-1.0, 0.F) == true); +} + +__host__ __device__ void test_isnan(float val) +{ + static_assert((cuda::std::is_same::value), ""); + + typedef decltype(cuda::std::isnan((double) 0)) DoubleRetType; + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isnan(-1.0) == false); + assert(cuda::std::isnan(0) == false); + assert(cuda::std::isnan(1) == false); + assert(cuda::std::isnan(-1) == false); + assert(cuda::std::isnan(cuda::std::numeric_limits::max()) == false); + assert(cuda::std::isnan(cuda::std::numeric_limits::min()) == false); +} + +__host__ __device__ _CCCL_CONSTEXPR_ISNAN bool test_constexpr_isnan(float val) +{ + return cuda::std::isnan(val); +} + +__host__ __device__ void test_isunordered(float val) +{ + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + 
static_assert((cuda::std::is_same::value), ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), + ""); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + static_assert( + (cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); + static_assert((cuda::std::is_same::value), ""); +#endif // _LIBCUDACXX_HAS_NVBF16 + assert(cuda::std::isunordered(-1.0, 0.F) == false); +} + +__host__ __device__ void test(float val) +{ + test_fpclassify(val); + test_signbit(val); + test_isfinite(val); + test_isnormal(val); + test_isgreater(val); + test_isgreaterequal(val); + test_isinf(val); + test_isless(val); + test_islessequal(val); + test_islessgreater(val); + test_isnan(val); + test_isunordered(val); +} + +__global__ void test_global_kernel(float* val) +{ + test(*val); +} + +int main(int, char**) +{ + volatile float val = 1.0f; + test(val); + +#if defined(_CCCL_BUILTIN_ISNAN) + static_assert(!test_constexpr_isnan(1.0f), ""); +#endif // _CCCL_BUILTIN_ISNAN + +#if defined(_CCCL_BUILTIN_ISINF) + static_assert(!test_constexpr_isinf(1.0f), ""); +#endif // _CCCL_BUILTIN_ISINF + +#if defined(_CCCL_BUILTIN_ISFINITE) || (defined(_CCCL_BUILTIN_ISINF) && defined(_CCCL_BUILTIN_ISNAN)) + static_assert(test_constexpr_isfinite(1.0f), ""); +#endif // _CCCL_BUILTIN_ISFINITE|| (_CCCL_BUILTIN_ISINF && _CCCL_BUILTIN_ISNAN) + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp new file mode 100644 index 00000000000..366ff77bd7a --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/lerp.pass.cpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
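// Illustrative sketch, not part of the new test file: cuda::std::lerp(a, b, t)
// computes a + t*(b - a); it is exact at t == 0 and t == 1 and extrapolates for t
// outside [0, 1]. The exactly-representable values the test checks can be
// reproduced with the naive formula (helper name is hypothetical):
template <class T>
constexpr T lerp_sketch(T a, T b, T t)
{
  return a + t * (b - a);
}
static_assert(lerp_sketch(0.0, 12.0, 0.5) == 6.0, "lerp(0, 12, 0.5) == 6");
static_assert(lerp_sketch(12.0, 0.0, 2.0) == -12.0, "lerp(12, 0, 2) == -12");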
+// +//===----------------------------------------------------------------------===// + +// + +// constexpr float lerp(float a, float b, float t) noexcept; +// constexpr double lerp(double a, double b, double t) noexcept; +// constexpr long double lerp(long double a, long double b, long double t) noexcept; + +#include +#include +#include +#include + +#include "fp_compare.h" +#include "test_macros.h" + +template +__host__ __device__ TEST_CONSTEXPR_CXX14 bool constexpr_test() +{ + return cuda::std::lerp(T(0.0), T(12), T(0.0)) == T(0.0) && cuda::std::lerp(T(12), T(0.0), T(0.5)) == T(6) + && cuda::std::lerp(T(0.0), T(12), T(2)) == T(24); +} + +template +__host__ __device__ void test() +{ + ASSERT_SAME_TYPE(T, decltype(cuda::std::lerp(T(), T(), T()))); + static_assert(noexcept(cuda::std::lerp(T(), T(), T())), ""); + + const T maxV = cuda::std::numeric_limits::max(); + const T inf = cuda::std::numeric_limits::infinity(); + + // Things that can be compared exactly + assert((cuda::std::lerp(T(0.0), T(12), T(0.0)) == T(0.0))); + assert((cuda::std::lerp(T(0.0), T(12), T(1)) == T(12))); + assert((cuda::std::lerp(T(12), T(0.0), T(0.0)) == T(12))); + assert((cuda::std::lerp(T(12), T(0.0), T(1)) == T(0.0))); + + assert((cuda::std::lerp(T(0.0), T(12), T(0.5)) == T(6))); + assert((cuda::std::lerp(T(12), T(0.0), T(0.5)) == T(6))); + assert((cuda::std::lerp(T(0.0), T(12), T(2)) == T(24))); + assert((cuda::std::lerp(T(12), T(0.0), T(2)) == T(-12))); + + assert((cuda::std::lerp(maxV, maxV / T(10), T(0.0)) == maxV)); + assert((cuda::std::lerp(maxV / T(10), maxV, T(1)) == maxV)); + + assert((cuda::std::lerp(T(2.3), T(2.3), inf) == T(2.3))); + + assert(cuda::std::lerp(T(0.0), T(0.0), T(23)) == T(0.0)); + + // __half and __nvbfloat have precision issues here + if (!cuda::std::__is_extended_floating_point::value) + { + assert(cuda::std::isnan(cuda::std::lerp(T(0.0), T(0.0), T(inf)))); + } +} + +int main(int, char**) +{ + test(); + test(); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test(); +#endif //!_LIBCUDACXX_HAS_NO_LONG_DOUBLE + +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(); +#endif // _LIBCUDACXX_HAS_NVBF16 + +#if TEST_STD_VER >= 2014 + static_assert(constexpr_test(), ""); + static_assert(constexpr_test(), ""); +#endif // TEST_STD_VER >= 2014 + + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp new file mode 100644 index 00000000000..ea0aff8f0ff --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/c.math/logarithms.pass.cpp @@ -0,0 +1,109 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
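// Annotation, not part of the new test file: the `ret` alias used by each helper
// below encodes the usual <cmath> promotion rule, i.e. integral arguments select
// the double overload, so for instance cuda::std::log(1) is a double equal to 0.0,
// while floating-point arguments return their own type. Every function in this
// file is probed at the value 1.0 (or 0.0 for log1p), where all of them are
// exactly zero: log(1) = log10(1) = log2(1) = logb(1) = log1p(0) = 0, and
// ilogb(1) = 0 as an int.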
+// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_log(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log(value) == ret{0}); +} + +template +__host__ __device__ void test_log10(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log10(value) == ret{0}); +} + +template +__host__ __device__ void test_ilogb(T value) +{ + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::ilogb(value) == 0); +} + +template +__host__ __device__ void test_log1p(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log1p(value - value) == ret{0}); +} + +template +__host__ __device__ void test_log2(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::log2(value) == ret{0}); +} + +template +__host__ __device__ void test_logb(T value) +{ + using ret = cuda::std::conditional_t::value, double, T>; + static_assert(cuda::std::is_same::value, ""); + assert(cuda::std::logb(value) == ret{0}); +} + +template +__host__ __device__ void test(T value) +{ + test_log(value); + test_log10(value); + test_ilogb(value); + test_log1p(value); + test_log2(value); + test_logb(value); +} + +__host__ __device__ void test(float value) +{ + test(value); + test(value); +#if !defined(_LIBCUDACXX_HAS_NO_LONG_DOUBLE) + test(value); +#endif // !_LIBCUDACXX_HAS_NO_LONG_DOUBLE +#ifdef _LIBCUDACXX_HAS_NVFP16 + test<__half>(__float2half(value)); +#endif // _LIBCUDACXX_HAS_NVFP16 +#ifdef _LIBCUDACXX_HAS_NVBF16 + test<__nv_bfloat16>(__float2bfloat16(value)); +#endif // _LIBCUDACXX_HAS_NVBF16 + + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); + test(static_cast(value)); +} + +__global__ void test_global_kernel(float* value) +{ + test(*value); +} + +int main(int, char**) +{ + volatile float value = 1.0f; + test(value); + return 0; +} From 831c62e94ce23091cc0d7d6280b6b38e9606baba Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 10:15:20 +0100 Subject: [PATCH 30/45] Drop memory resources in libcu++ (#2860) This moves our memory resources into cuda::experimental That way it is easier to use them because cudax is the actual user --- cudax/examples/simple_p2p.cu | 6 +- .../uninitialized_async_buffer.cuh | 11 +- .../__container/uninitialized_buffer.cuh | 9 +- .../__memory_resource/any_resource.cuh | 6 +- .../__memory_resource/device_memory_pool.cuh | 17 +- .../device_memory_resource.cuh | 67 ++--- .../managed_memory_resource.cuh | 254 ++++++++++++++++ .../pinned_memory_resource.cuh | 256 ++++++++++++++++ .../__memory_resource/properties.cuh | 49 ++++ .../__memory_resource/shared_resource.cuh | 4 +- .../cuda/experimental/memory_resource.cuh | 3 + cudax/test/CMakeLists.txt | 2 + cudax/test/algorithm/common.cuh | 4 +- cudax/test/algorithm/copy.cu | 10 +- cudax/test/algorithm/fill.cu | 6 +- .../containers/uninitialized_async_buffer.cu | 10 +- cudax/test/containers/uninitialized_buffer.cu | 19 +- .../memory_resource/any_async_resource.cu | 14 +- cudax/test/memory_resource/any_resource.cu | 12 +- 
.../memory_resource/device_memory_pool.cu | 30 +- .../memory_resource/device_memory_resource.cu | 64 ++-- .../managed_memory_resource.cu | 273 +++++++++++++++++ .../memory_resource/pinned_memory_resource.cu | 274 ++++++++++++++++++ cudax/test/memory_resource/shared_resource.cu | 14 +- .../device_memory_resource.h | 219 -------------- .../managed_memory_resource.h | 201 ------------- .../pinned_memory_resource.h | 204 ------------- libcudacxx/include/cuda/memory_resource | 3 - .../device_memory_resource/allocate.pass.cpp | 95 ------ .../device_memory_resource/equality.pass.cpp | 144 --------- .../device_memory_resource/traits.pass.cpp | 31 -- .../managed_memory_resource/allocate.pass.cpp | 96 ------ .../managed_memory_resource/equality.pass.cpp | 130 --------- .../managed_memory_resource/traits.pass.cpp | 31 -- .../pinned_memory_resource/allocate.pass.cpp | 98 ------- .../pinned_memory_resource/equality.pass.cpp | 132 --------- .../pinned_memory_resource/traits.pass.cpp | 31 -- 37 files changed, 1263 insertions(+), 1566 deletions(-) create mode 100644 cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh create mode 100644 cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh create mode 100644 cudax/include/cuda/experimental/__memory_resource/properties.cuh create mode 100644 cudax/test/memory_resource/managed_memory_resource.cu create mode 100644 cudax/test/memory_resource/pinned_memory_resource.cu delete mode 100644 libcudacxx/include/cuda/__memory_resource/device_memory_resource.h delete mode 100644 libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h delete mode 100644 libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp delete mode 100644 libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu index 5b83a43b904..c6d9be9f707 100644 --- a/cudax/examples/simple_p2p.cu +++ b/cudax/examples/simple_p2p.cu @@ -121,7 +121,7 @@ void test_cross_device_access_from_kernel( // This will be a pinned memory vector once available cudax::uninitialized_buffer host_buffer( - cuda::mr::pinned_memory_resource(), dev0_buffer.size()); + cudax::pinned_memory_resource(), dev0_buffer.size()); std::generate(host_buffer.begin(), host_buffer.end(), []() { static int i = 0; return static_cast((i++) % 4096); @@ -219,9 +219,9 @@ try cudax::stream dev1_stream(peers[1]); printf("Enabling peer access between GPU%d and GPU%d...\n", peers[0].get(), peers[1].get()); - cudax::mr::device_memory_resource dev0_resource(peers[0]); + cudax::device_memory_resource dev0_resource(peers[0]); 
dev0_resource.enable_peer_access_from(peers[1]); - cudax::mr::device_memory_resource dev1_resource(peers[1]); + cudax::device_memory_resource dev1_resource(peers[1]); dev1_resource.enable_peer_access_from(peers[0]); // Allocate buffers diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index fb502cbbf7d..731ed555bb3 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -33,6 +33,7 @@ #include #include +#include #if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) @@ -73,10 +74,10 @@ class uninitialized_async_buffer { private: static_assert(_CUDA_VMR::__contains_execution_space_property<_Properties...>, - "The properties of cuda::experimental::mr::uninitialized_async_buffer must contain at least one " + "The properties of cuda::experimental::uninitialized_async_buffer must contain at least one " "execution space property!"); - using __async_resource = ::cuda::experimental::mr::any_async_resource<_Properties...>; + using __async_resource = ::cuda::experimental::any_async_resource<_Properties...>; __async_resource __mr_; ::cuda::stream_ref __stream_ = {}; @@ -117,7 +118,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { // TODO add auto synchronization return {__self.__get_data(), __self.size()}; @@ -129,7 +130,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { // TODO add auto synchronization return {__self.__get_data(), __self.size()}; @@ -294,7 +295,7 @@ public: }; template -using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_VMR::device_accessible>; +using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, mr::device_accessible>; } // namespace cuda::experimental diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 9a2f1200678..1f661c0c7d5 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -32,6 +32,7 @@ #include #include +#include #if _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) @@ -67,7 +68,7 @@ private: "The properties of cuda::experimental::uninitialized_buffer must contain at least one execution space " "property!"); - using __resource = ::cuda::experimental::mr::any_resource<_Properties...>; + using __resource = ::cuda::experimental::any_resource<_Properties...>; __resource __mr_; size_t __count_ = 0; @@ -107,7 +108,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, 
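// Annotation, not part of the change: this hidden friend is the hook cudax's
// launch machinery uses, letting an uninitialized_buffer be passed directly to a
// kernel launch. When the buffer advertises the device_accessible property,
// __cudax_launch_transform decays it to a cuda::std::span<T> over its storage.
// The patch only re-points the property check from _CUDA_VMR::device_accessible
// to the properties now provided by cuda/experimental/__memory_resource/properties.cuh,
// which is newly included above.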
uninitialized_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { return {__self.__get_data(), __self.size()}; } @@ -118,7 +119,7 @@ private: _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span)( - _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>) + _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v) { return {__self.__get_data(), __self.size()}; } @@ -259,7 +260,7 @@ public: }; template -using uninitialized_device_buffer = uninitialized_buffer<_Tp, _CUDA_VMR::device_accessible>; +using uninitialized_device_buffer = uninitialized_buffer<_Tp, mr::device_accessible>; } // namespace cuda::experimental diff --git a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh index f442e56dcfe..c3d6fce7a08 100644 --- a/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/any_resource.cuh @@ -51,7 +51,7 @@ #include #include -namespace cuda::experimental::mr +namespace cuda::experimental { template > _CCCL_INLINE_VAR constexpr bool __is_basic_any_resource = false; @@ -73,7 +73,7 @@ class basic_any_resource { private: static_assert(_CUDA_VMR::__contains_execution_space_property<_Properties...>, - "The properties of cuda::experimental::mr::basic_any_resource must contain at least one execution " + "The properties of cuda::experimental::basic_any_resource must contain at least one execution " "space property!"); template <_CUDA_VMR::_AllocType, class...> @@ -352,6 +352,6 @@ auto make_any_async_resource(_Args&&... __args) -> any_async_resource<_Propertie return any_async_resource<_Properties...>{_CUDA_VSTD::in_place_type<_Resource>, _CUDA_VSTD::forward<_Args>(__args)...}; } -} // namespace cuda::experimental::mr +} // namespace cuda::experimental #endif //_CUDAX__MEMORY_RESOURCE_ANY_RESOURCE_H diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh index c74f7d68f77..f3ffcfeea24 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh @@ -43,7 +43,7 @@ //! @file //! The \c device_memory_pool class provides a wrapper around a `cudaMempool_t`. -namespace cuda::experimental::mr +namespace cuda::experimental { //! @brief Checks whether the current device supports \c cudaMallocAsync. @@ -166,7 +166,7 @@ private: _CCCL_NODISCARD static cudaMemPool_t __create_cuda_mempool(const int __device_id, memory_pool_properties __properties) noexcept { - ::cuda::experimental::mr::__device_supports_stream_ordered_allocations(__device_id); + ::cuda::experimental::__device_supports_stream_ordered_allocations(__device_id); device_memory_pool::__cuda_supports_export_handle_type(__device_id, __properties.allocation_handle_type); ::cudaMemPoolProps __pool_properties{}; @@ -315,7 +315,7 @@ public: //! 
@param __devices A vector of `device_ref`s listing devices to enable access for void enable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_handle_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtReadWrite); } @@ -324,8 +324,7 @@ public: //! @param __device device_ref indicating for which device the access should be enabled void enable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access( - __pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); + ::cuda::experimental::__mempool_switch_peer_access(__pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); } //! @brief Disable peer access to this memory pool from the supplied devices @@ -335,7 +334,7 @@ public: //! @param __devices A vector of `device_ref`s listing devices to disable access for void disable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_handle_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtNone); } @@ -344,7 +343,7 @@ public: //! @param __device device_ref indicating for which device the access should be disable void disable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access(__pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtNone); + ::cuda::experimental::__mempool_switch_peer_access(__pool_handle_, {&__device, 1}, cudaMemAccessFlagsProtNone); } //! @brief Query if memory allocated through this memory resource is accessible by the supplied device @@ -352,7 +351,7 @@ public: //! @param __device device for which the peer access is queried _CCCL_NODISCARD bool is_accessible_from(device_ref __device) { - return ::cuda::experimental::mr::__mempool_get_access(__pool_handle_, __device); + return ::cuda::experimental::__mempool_get_access(__pool_handle_, __device); } //! @brief Equality comparison with another \c device_memory_pool. @@ -424,7 +423,7 @@ public: static device_memory_pool from_native_handle(_CUDA_VSTD::nullptr_t) = delete; }; -} // namespace cuda::experimental::mr +} // namespace cuda::experimental # endif // _CCCL_STD_VER >= 2014 diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh index fffe3dea722..c0aedab7fa9 100644 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh @@ -40,6 +40,7 @@ # include # include +# include # include # if _CCCL_STD_VER >= 2014 @@ -47,7 +48,7 @@ //! @file //! The \c device_memory_pool class provides an asynchronous memory resource that allocates device memory in stream //! order. -namespace cuda::experimental::mr +namespace cuda::experimental { //! @brief global stream to synchronize in the synchronous interface of \c device_memory_resource @@ -92,7 +93,7 @@ private: //! @returns The default memory pool of the specified device. _CCCL_NODISCARD static ::cudaMemPool_t __get_default_mem_pool(const int __device_id) { - ::cuda::experimental::mr::__device_supports_stream_ordered_allocations(__device_id); + ::cuda::experimental::__device_supports_stream_ordered_allocations(__device_id); ::cudaMemPool_t __pool; _CCCL_TRY_CUDA_API( @@ -247,7 +248,7 @@ public: //! 
@param __devices A vector of `device_ref`s listing devices to enable access for void enable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtReadWrite); } @@ -259,7 +260,7 @@ public: //! @param __device device_ref indicating for which device the access should be enabled void enable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); + ::cuda::experimental::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtReadWrite); } //! @brief Enable peer access to memory allocated through this memory resource by the supplied devices @@ -271,7 +272,7 @@ public: //! @param __devices A vector of `device_ref`s listing devices to disable access for void disable_peer_access_from(const ::std::vector& __devices) { - ::cuda::experimental::mr::__mempool_switch_peer_access( + ::cuda::experimental::__mempool_switch_peer_access( __pool_, {__devices.data(), __devices.size()}, cudaMemAccessFlagsProtNone); } @@ -283,7 +284,7 @@ public: //! @param __device device_ref indicating for which device the access should be enabled void disable_peer_access_from(device_ref __device) { - ::cuda::experimental::mr::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtNone); + ::cuda::experimental::__mempool_switch_peer_access(__pool_, {&__device, 1}, cudaMemAccessFlagsProtNone); } //! @brief Query if memory allocated through this memory resource is accessible by the supplied device @@ -291,7 +292,7 @@ public: //! @param __device device for which the peer access is queried _CCCL_NODISCARD bool is_accessible_from(device_ref __device) { - return ::cuda::experimental::mr::__mempool_get_access(__pool_, __device); + return ::cuda::experimental::__mempool_get_access(__pool_, __device); } //! @brief Equality comparison with another device_memory_resource. 
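A minimal sketch of how the peer-access controls above compose with stream-ordered allocation, assuming two peer-capable devices; the helper name `share_with_peer`, the sizes, and the abbreviated includes are illustrative only and not part of the patch:

    #define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
    #include <cuda/experimental/memory_resource.cuh>
    #include <cassert>

    namespace cudax = cuda::experimental;

    // Allocate from dev0's default pool and let dev1 touch the memory.
    void share_with_peer(cudax::device_ref dev0, cudax::device_ref dev1, ::cuda::stream_ref stream_on_dev1)
    {
      cudax::device_memory_resource mr{dev0};
      mr.enable_peer_access_from(dev1);
      assert(mr.is_accessible_from(dev1));

      void* ptr = mr.allocate_async(1 << 20, stream_on_dev1); // stream-ordered allocation
      // ... launch kernels on stream_on_dev1 that read/write ptr from dev1 ...
      mr.deallocate_async(ptr, 1 << 20, stream_on_dev1);
    }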
@@ -319,10 +320,10 @@ public: _CCCL_REQUIRES((_CUDA_VMR::__different_resource) ) _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept { - if constexpr (has_property<_Resource, _CUDA_VMR::device_accessible>) + if constexpr (has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(this)} - == _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } else { @@ -332,68 +333,68 @@ public: # else // ^^^ C++20 ^^^ / vvv C++17 template _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - == _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const&, _Resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return false; } template _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - == _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator==(_Resource const&, device_memory_resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return false; } template _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - != _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + != _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const&, _Resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return true; } template _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, 
device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& - has_property<_Resource, _CUDA_VMR::device_accessible>) + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& has_property<_Resource, device_accessible>) { - return _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast(__lhs)} - != _CUDA_VMR::resource_ref<_CUDA_VMR::device_accessible>{const_cast<_Resource&>(__rhs)}; + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + != _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; } template _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const&, device_memory_resource const&) noexcept _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource - && !has_property<_Resource, _CUDA_VMR::device_accessible>) + && !has_property<_Resource, device_accessible>) { return true; } @@ -408,12 +409,12 @@ public: # ifndef _CCCL_DOXYGEN_INVOKED // Doxygen cannot handle the friend function //! @brief Enables the \c device_accessible property for \c device_memory_resource. //! @relates device_memory_resource - friend constexpr void get_property(device_memory_resource const&, _CUDA_VMR::device_accessible) noexcept {} + friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {} # endif // _CCCL_DOXYGEN_INVOKED }; -static_assert(_CUDA_VMR::resource_with, ""); +static_assert(_CUDA_VMR::resource_with, ""); -} // namespace cuda::experimental::mr +} // namespace cuda::experimental # endif // _CCCL_STD_VER >= 2014 diff --git a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh new file mode 100644 index 00000000000..f240155339c --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh @@ -0,0 +1,254 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_RESOURCE_CUH +#define _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_RESOURCE_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER_CLANG) +# include +#endif // _CCCL_CUDA_COMPILER_CLANG + +#include +#include +#include +#include +#include +#include +#include + +#include + +//! @file +//! The \c managed_memory_resource class provides a memory resource that allocates managed memory. +namespace cuda::experimental +{ + +//! @brief \c managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. 
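A short usage sketch of the interface declared just below (the flag and size are arbitrary, error handling is elided):

    namespace cudax = cuda::experimental;

    cudax::managed_memory_resource mr{cudaMemAttachGlobal}; // cudaMemAttachHost is the other accepted flag
    float* data = static_cast<float*>(mr.allocate(1024 * sizeof(float))); // cudaMallocManaged under the hood
    data[0] = 1.0f;                                         // managed memory is valid on host and device
    mr.deallocate(data, 1024 * sizeof(float));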
+class managed_memory_resource +{ +private: + unsigned int __flags_ = cudaMemAttachGlobal; + + static constexpr unsigned int __available_flags = cudaMemAttachGlobal | cudaMemAttachHost; + +public: + constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept + : __flags_(__flags & __available_flags) + { + _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource"); + } + + //! @brief Allocate CUDA unified memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. + //! @return Pointer to the newly allocated memory + _CCCL_NODISCARD void* allocate(const size_t __bytes, + const size_t __alignment = _CUDA_VMR::default_cuda_malloc_alignment) const + { + // We need to ensure that the provided alignment matches the minimal provided alignment + if (!__is_valid_alignment(__alignment)) + { + _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to managed_memory_resource::allocate."); + } + + void* __ptr{nullptr}; + _CCCL_TRY_CUDA_API( + ::cudaMallocManaged, "Failed to allocate memory with cudaMallocManaged.", &__ptr, __bytes, __flags_); + return __ptr; + } + + //! @brief Allocate CUDA unified memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @param __stream Stream on which to perform allocation. Currently ignored + //! @throws std::invalid_argument In case of invalid alignment. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes, __alignment); + } + + //! @brief Allocate CUDA unified memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __stream Stream on which to perform allocation. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes); + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. + //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. + void deallocate( + void* __ptr, const size_t, const size_t __alignment = _CUDA_VMR::default_cuda_malloc_alignment) const noexcept + { + // We need to ensure that the provided alignment matches the minimal provided alignment + _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to managed_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr); + (void) __alignment; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! 
@param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __alignment; + (void) __stream; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! @param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, size_t __bytes, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __stream; + } + + //! @brief Equality comparison with another \c managed_memory_resource. + //! @param __other The other \c managed_memory_resource. + //! @return Whether both \c managed_memory_resource were constructed with the same flags. + _CCCL_NODISCARD constexpr bool operator==(managed_memory_resource const& __other) const noexcept + { + return __flags_ == __other.__flags_; + } +#if _CCCL_STD_VER <= 2017 + //! @brief Inequality comparison with another \c managed_memory_resource. + //! @param __other The other \c managed_memory_resource. + //! @return Whether both \c managed_memory_resource were constructed with different flags. + _CCCL_NODISCARD constexpr bool operator!=(managed_memory_resource const& __other) const noexcept + { + return __flags_ != __other.__flags_; + } +#endif // _CCCL_STD_VER <= 2017 + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# if _CCCL_STD_VER >= 2020 + //! @brief Equality comparison between a \c managed_memory_resource and another resource + //! @param __rhs The resource to compare to + //! @return If the underlying types are equality comparable, returns the result of equality comparison of both + //! resources. Otherwise, returns false. 
+ _CCCL_TEMPLATE(class _Resource) + _CCCL_REQUIRES(_CUDA_VMR::__different_resource) + _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept + { + if constexpr (has_property<_Resource, mr::device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else if constexpr (has_property<_Resource, mr::host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else + { + return false; + } + } +# else // ^^^ C++20 ^^^ / vvv C++17 + template + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource&& + has_property<_Resource, mr::device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !has_property<_Resource, mr::device_accessible> && has_property<_Resource, mr::host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const&, _Resource const&) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !has_property<_Resource, mr::device_accessible> && !has_property<_Resource, mr::host_accessible>) + { + return false; + } + + template + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, managed_memory_resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return __rhs == __lhs; + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return !(__lhs == __rhs); + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return !(__rhs == __lhs); + } +# endif // _CCCL_STD_VER <= 2017 + + //! @brief Enables the \c device_accessible property + friend constexpr void get_property(managed_memory_resource const&, mr::device_accessible) noexcept {} + //! @brief Enables the \c host_accessible property + friend constexpr void get_property(managed_memory_resource const&, mr::host_accessible) noexcept {} +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //!
@brief Checks whether the passed in alignment is valid + static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept + { + return __alignment <= _CUDA_VMR::default_cuda_malloc_alignment + && (_CUDA_VMR::default_cuda_malloc_alignment % __alignment == 0); + } +}; +static_assert(_CUDA_VMR::async_resource_with, ""); +static_assert(_CUDA_VMR::async_resource_with, ""); + +} // namespace cuda::experimental + +#endif //_CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_RESOURCE_CUH diff --git a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh new file mode 100644 index 00000000000..60ec7c9b49e --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh @@ -0,0 +1,256 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H +#define _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER_CLANG) +# include +# include +#endif // _CCCL_CUDA_COMPILER_CLANG + +#include +#include +#include +#include +#include +#include +#include + +#include + +//! @file +//! The \c managed_memory_resource class provides a memory resource that allocates pinned memory. +namespace cuda::experimental +{ + +//! @brief pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. +class pinned_memory_resource +{ +private: + unsigned int __flags_ = cudaHostAllocDefault; + + static constexpr unsigned int __available_flags = + cudaHostAllocDefault | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined; + +public: + constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept + : __flags_(__flags & __available_flags) + { + _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource"); + } + + //! @brief Allocate host memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. + //! @return Pointer to the newly allocated memory + _CCCL_NODISCARD void* allocate(const size_t __bytes, + const size_t __alignment = _CUDA_VMR::default_cuda_malloc_host_alignment) const + { + // We need to ensure that the provided alignment matches the minimal provided alignment + if (!__is_valid_alignment(__alignment)) + { + _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to pinned_memory_resource::allocate."); + } + + void* __ptr{nullptr}; + _CCCL_TRY_CUDA_API(::cudaMallocHost, "Failed to allocate memory with cudaMallocHost.", &__ptr, __bytes, __flags_); + return __ptr; + } + + //! 
@brief Allocate host memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __alignment The requested alignment of the allocation. + //! @param __stream Stream on which to perform allocation. Currently ignored + //! @throws std::invalid_argument In case of invalid alignment. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes, __alignment); + } + + //! @brief Allocate host memory of size at least \p __bytes. + //! @param __bytes The size in bytes of the allocation. + //! @param __stream Stream on which to perform allocation. + //! @throws cuda::cuda_error If an error code was return by the cuda api call. + //! @returns Pointer to the newly allocated memory. + _CCCL_NODISCARD void* allocate_async(const size_t __bytes, const ::cuda::stream_ref __stream) + { + (void) __stream; + return allocate(__bytes); + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. + //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. + void deallocate( + void* __ptr, const size_t, const size_t __alignment = _CUDA_VMR::default_cuda_malloc_host_alignment) const noexcept + { + // We need to ensure that the provided alignment matches the minimal provided alignment + _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to pinned_memory_resource::deallocate."); + _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr); + (void) __alignment; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! @param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __alignment The alignment that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, const size_t __bytes, const size_t __alignment, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __alignment; + (void) __stream; + } + + //! @brief Deallocate memory pointed to by \p __ptr. + //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_async`. + //! @param __bytes The number of bytes that was passed to the `allocate_async` call that returned \p __ptr. + //! @param __stream A stream that has a stream ordering relationship with the stream used in the + //! allocate_async call + //! that returned \p __ptr. + //! @note The pointer passed to `deallocate_async` must not be in use in a stream other than \p __stream. + //! 
It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate_async`. + void deallocate_async(void* __ptr, size_t __bytes, const ::cuda::stream_ref __stream) + { + deallocate(__ptr, __bytes); + (void) __stream; + } + + //! @brief Equality comparison with another \c pinned_memory_resource. + //! @param __other The other \c pinned_memory_resource. + //! @return Whether both \c pinned_memory_resource were constructed with the same flags. + _CCCL_NODISCARD constexpr bool operator==(pinned_memory_resource const& __other) const noexcept + { + return __flags_ == __other.__flags_; + } +#if _CCCL_STD_VER <= 2017 + //! @brief Inequality comparison with another \c pinned_memory_resource. + //! @param __other The other \c pinned_memory_resource. + //! @return Whether both \c pinned_memory_resource were constructed with different flags. + _CCCL_NODISCARD constexpr bool operator!=(pinned_memory_resource const& __other) const noexcept + { + return __flags_ != __other.__flags_; + } +#endif // _CCCL_STD_VER <= 2017 + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +# if _CCCL_STD_VER >= 2020 + //! @brief Equality comparison between a \c pinned_memory_resource and another resource + //! @param __rhs The resource to compare to + //! @return If the underlying types are equality comparable, returns the result of equality comparison of both + //! resources. Otherwise, returns false. + _CCCL_TEMPLATE(class _Resource) + _CCCL_REQUIRES(_CUDA_VMR::__different_resource) + _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept + { + if constexpr (has_property<_Resource, device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else if constexpr (has_property<_Resource, host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(this)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + else + { + return false; + } + } +# else // ^^^ C++20 ^^^ / vvv C++17 + template + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource&& ::cuda::has_property<_Resource, device_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !::cuda::has_property<_Resource, device_accessible> && ::cuda::has_property<_Resource, host_accessible>) + { + return _CUDA_VMR::resource_ref{const_cast(__lhs)} + == _CUDA_VMR::resource_ref{const_cast<_Resource&>(__rhs)}; + } + template + _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const&, _Resource const&) noexcept + _CCCL_TRAILING_REQUIRES(bool)( + _CUDA_VMR::__different_resource + && !::cuda::has_property<_Resource, device_accessible> && !::cuda::has_property<_Resource, host_accessible>) + { + return false; + } + + template + _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, pinned_memory_resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return __rhs == __lhs; + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return
!(__lhs == __rhs); + } + + template + _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept + _CCCL_TRAILING_REQUIRES(bool)(_CUDA_VMR::__different_resource) + { + return !(__rhs == __lhs); + } +# endif // _CCCL_STD_VER <= 2017 + + //! @brief Enables the \c device_accessible property + friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} + //! @brief Enables the \c host_accessible property + friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @brief Checks whether the passed in alignment is valid + static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept + { + return __alignment <= _CUDA_VMR::default_cuda_malloc_host_alignment + && (_CUDA_VMR::default_cuda_malloc_host_alignment % __alignment == 0); + } +}; +static_assert(_CUDA_VMR::async_resource_with, ""); +static_assert(_CUDA_VMR::async_resource_with, ""); + +} // namespace cuda::experimental + +#endif //_CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H diff --git a/cudax/include/cuda/experimental/__memory_resource/properties.cuh b/cudax/include/cuda/experimental/__memory_resource/properties.cuh new file mode 100644 index 00000000000..b1646ab4b36 --- /dev/null +++ b/cudax/include/cuda/experimental/__memory_resource/properties.cuh @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDAX__MEMORY_RESOURCE_PROPERTIES_CUH +#define _CUDAX__MEMORY_RESOURCE_PROPERTIES_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// If the memory resource header was included without the experimental flag, +// tell the user to define the experimental flag. 
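In practice this means user code defines the macro before any cudax include and can then refer to the accessibility properties directly from cuda::experimental (they are re-exported further down in this header). A sketch, where fill_on_device is a hypothetical helper and not part of the patch:

    #define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
    #include <cuda/experimental/memory_resource.cuh>
    #include <cstddef>

    namespace cudax = cuda::experimental;

    // Constrain a generic helper to resources whose allocations the device can access.
    template <class Resource>
    void* fill_on_device(Resource& mr, std::size_t bytes)
    {
      static_assert(::cuda::has_property<Resource, cudax::device_accessible>,
                    "resource must provide device-accessible memory");
      void* ptr = mr.allocate(bytes);
      // ... launch a kernel that writes to ptr ...
      return ptr;
    }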
+#if defined(_CUDA_MEMORY_RESOURCE) && !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# error "To use the experimental memory resource, define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" +#endif + +// cuda::mr is unavable on MSVC 2017 +#if _CCCL_COMPILER(MSVC2017) +# error "The any_resource header is not supported on MSVC 2017" +#endif // _CCCL_COMPILER(MSVC2017) + +#if !defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) +# define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE +#endif + +#include + +namespace cuda::experimental +{ + +using ::cuda::mr::device_accessible; +using ::cuda::mr::host_accessible; + +} // namespace cuda::experimental + +#endif //_CUDAX__MEMORY_RESOURCE_PROPERTIES_CUH diff --git a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh index 1b0a81320b1..bfea3e43e68 100644 --- a/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/shared_resource.cuh @@ -44,7 +44,7 @@ #include #include -namespace cuda::experimental::mr +namespace cuda::experimental { //! @rst @@ -268,6 +268,6 @@ auto make_shared_resource(_Args&&... __args) -> shared_resource<_Resource> return shared_resource<_Resource>{_CUDA_VSTD::forward<_Args>(__args)...}; } -} // namespace cuda::experimental::mr +} // namespace cuda::experimental #endif // _CUDAX__MEMORY_RESOURCE_SHARED_RESOURCE_H diff --git a/cudax/include/cuda/experimental/memory_resource.cuh b/cudax/include/cuda/experimental/memory_resource.cuh index 42f32a97d8a..c1bb3b916ac 100644 --- a/cudax/include/cuda/experimental/memory_resource.cuh +++ b/cudax/include/cuda/experimental/memory_resource.cuh @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #endif // __CUDAX_MEMORY_RESOURCE___ diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 2e54f2ca6d1..9af2c83cc6f 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -110,6 +110,8 @@ foreach(cn_target IN LISTS cudax_TARGETS) memory_resource/any_resource.cu memory_resource/device_memory_pool.cu memory_resource/device_memory_resource.cu + memory_resource/managed_memory_resource.cu + memory_resource/pinned_memory_resource.cu memory_resource/shared_resource.cu ) diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index 4b262966190..c4c7be0d02c 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -63,11 +63,11 @@ namespace cuda::experimental template > struct weird_buffer { - const cuda::mr::pinned_memory_resource& resource; + const pinned_memory_resource& resource; int* data; std::size_t size; - weird_buffer(const cuda::mr::pinned_memory_resource& res, std::size_t s) + weird_buffer(const pinned_memory_resource& res, std::size_t s) : resource(res) , data((int*) res.allocate(s * sizeof(int))) , size(s) diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu index 3db65e22c51..afb9a2b71d5 100644 --- a/cudax/test/algorithm/copy.cu +++ b/cudax/test/algorithm/copy.cu @@ -16,7 +16,7 @@ TEST_CASE("1d Copy", "[data_manipulation]") SECTION("Device resource") { - cudax::mr::device_memory_resource device_resource; + cudax::device_memory_resource device_resource; std::vector host_vector(buffer_size); { @@ -46,8 +46,8 @@ TEST_CASE("1d Copy", "[data_manipulation]") SECTION("Host and managed resource") { - cuda::mr::managed_memory_resource managed_resource; - cuda::mr::pinned_memory_resource 
host_resource; + cudax::managed_memory_resource managed_resource; + cudax::pinned_memory_resource host_resource; { cudax::uninitialized_buffer host_buffer(host_resource, buffer_size); @@ -78,7 +78,7 @@ TEST_CASE("1d Copy", "[data_manipulation]") } SECTION("Launch transform") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::weird_buffer input(host_resource, buffer_size); cudax::weird_buffer output(host_resource, buffer_size); @@ -90,7 +90,7 @@ TEST_CASE("1d Copy", "[data_manipulation]") SECTION("Asymetric size") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::uninitialized_buffer host_buffer(host_resource, 1); cudax::fill_bytes(_stream, host_buffer, fill_byte); diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu index ce733871f51..35fae342ad3 100644 --- a/cudax/test/algorithm/fill.cu +++ b/cudax/test/algorithm/fill.cu @@ -15,7 +15,7 @@ TEST_CASE("Fill", "[data_manipulation]") cudax::stream _stream; SECTION("Host resource") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::uninitialized_buffer buffer(host_resource, buffer_size); cudax::fill_bytes(_stream, buffer, fill_byte); @@ -25,7 +25,7 @@ TEST_CASE("Fill", "[data_manipulation]") SECTION("Device resource") { - cuda::mr::device_memory_resource device_resource; + cudax::device_memory_resource device_resource; cudax::uninitialized_buffer buffer(device_resource, buffer_size); cudax::fill_bytes(_stream, buffer, fill_byte); @@ -37,7 +37,7 @@ TEST_CASE("Fill", "[data_manipulation]") } SECTION("Launch transform") { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; cudax::weird_buffer buffer(host_resource, buffer_size); cudax::fill_bytes(_stream, buffer, fill_byte); diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 6a63a5f99f2..3ec6f1bed6d 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -42,7 +42,7 @@ constexpr int get_property( { return 42; } -constexpr int get_property(const cuda::experimental::mr::device_memory_resource&, my_property) +constexpr int get_property(const cuda::experimental::device_memory_resource&, my_property) { return 42; } @@ -56,7 +56,7 @@ TEMPLATE_TEST_CASE( static_assert(!cuda::std::is_copy_constructible::value, ""); static_assert(!cuda::std::is_copy_assignable::value, ""); - cuda::experimental::mr::device_memory_resource resource{}; + cuda::experimental::device_memory_resource resource{}; cuda::experimental::stream stream{}; SECTION("construction") @@ -207,7 +207,7 @@ TEMPLATE_TEST_CASE( // A test resource that keeps track of the number of resources are // currently alive. 
-struct test_async_device_memory_resource : cudax::mr::device_memory_resource +struct test_async_device_memory_resource : cudax::device_memory_resource { static int count; @@ -217,7 +217,7 @@ struct test_async_device_memory_resource : cudax::mr::device_memory_resource } test_async_device_memory_resource(const test_async_device_memory_resource& other) - : cudax::mr::device_memory_resource{other} + : cudax::device_memory_resource{other} { ++count; } @@ -234,7 +234,7 @@ TEST_CASE("uninitialized_async_buffer's memory resource does not dangle", "[cont { cuda::experimental::stream stream{}; cudax::uninitialized_async_buffer buffer{ - cudax::mr::device_memory_resource{}, stream, 0}; + cudax::device_memory_resource{}, stream, 0}; { CHECK(test_async_device_memory_resource::count == 0); diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 22fe1ef473c..56ac77da86c 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -20,6 +19,7 @@ #include #include +#include #include #include "testing.cuh" @@ -56,7 +56,7 @@ constexpr int get_property( { return 42; } -constexpr int get_property(const cuda::mr::device_memory_resource&, my_property) +constexpr int get_property(const cudax::device_memory_resource&, my_property) { return 42; } @@ -69,7 +69,7 @@ TEMPLATE_TEST_CASE( static_assert(!cuda::std::is_copy_constructible::value, ""); static_assert(!cuda::std::is_copy_assignable::value, ""); - cuda::mr::device_memory_resource resource{}; + cudax::device_memory_resource resource{}; SECTION("construction") { @@ -111,7 +111,7 @@ TEMPLATE_TEST_CASE( { static_assert(!cuda::std::is_copy_assignable::value, ""); { - cuda::mr::managed_memory_resource other_resource{}; + cudax::managed_memory_resource other_resource{}; uninitialized_buffer input{other_resource, 42}; uninitialized_buffer buf{resource, 1337}; const auto* old_ptr = buf.data(); @@ -222,7 +222,7 @@ TEST_CASE("uninitialized_buffer is usable with cudax::launch", "[container]") SECTION("non-const") { const int grid_size = 4; - cudax::uninitialized_buffer buffer{cuda::mr::device_memory_resource{}, 1024}; + cudax::uninitialized_buffer buffer{cudax::device_memory_resource{}, 1024}; auto dimensions = cudax::make_hierarchy(cudax::grid_dims(grid_size), cudax::block_dims<256>()); cudax::stream stream; @@ -233,8 +233,7 @@ TEST_CASE("uninitialized_buffer is usable with cudax::launch", "[container]") SECTION("const") { const int grid_size = 4; - const cudax::uninitialized_buffer buffer{ - cuda::mr::device_memory_resource{}, 1024}; + const cudax::uninitialized_buffer buffer{cudax::device_memory_resource{}, 1024}; auto dimensions = cudax::make_hierarchy(cudax::grid_dims(grid_size), cudax::block_dims<256>()); cudax::stream stream; @@ -245,7 +244,7 @@ TEST_CASE("uninitialized_buffer is usable with cudax::launch", "[container]") // A test resource that keeps track of the number of resources are // currently alive. 
-struct test_device_memory_resource : cuda::mr::device_memory_resource +struct test_device_memory_resource : cudax::device_memory_resource { static int count; @@ -255,7 +254,7 @@ struct test_device_memory_resource : cuda::mr::device_memory_resource } test_device_memory_resource(const test_device_memory_resource& other) - : cuda::mr::device_memory_resource{other} + : cudax::device_memory_resource{other} { ++count; } @@ -270,7 +269,7 @@ int test_device_memory_resource::count = 0; TEST_CASE("uninitialized_buffer's memory resource does not dangle", "[container]") { - cudax::uninitialized_buffer buffer{cuda::mr::device_memory_resource{}, 0}; + cudax::uninitialized_buffer buffer{cudax::device_memory_resource{}, 0}; { CHECK(test_device_memory_resource::count == 0); diff --git a/cudax/test/memory_resource/any_async_resource.cu b/cudax/test/memory_resource/any_async_resource.cu index f032ac3f6b8..89c28b8a279 100644 --- a/cudax/test/memory_resource/any_async_resource.cu +++ b/cudax/test/memory_resource/any_async_resource.cu @@ -25,7 +25,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -44,7 +44,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -79,7 +79,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -108,7 +108,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou CHECK(this->counts == expected); { cudax::stream stream{}; - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -135,7 +135,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou { Counts expected{}; { - cudax::mr::any_async_resource mr{TestResource{42, this}}; + cudax::any_async_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -165,8 +165,8 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_async_resource", "[container][resou Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_async_resource mr = - cudax::mr::make_any_async_resource(42, this); + cudax::any_async_resource mr = + cudax::make_any_async_resource(42, this); expected.new_count += is_big; ++expected.object_count; CHECK(this->counts == expected); diff --git a/cudax/test/memory_resource/any_resource.cu b/cudax/test/memory_resource/any_resource.cu index 213dee61d93..c013785f32f 100644 --- a/cudax/test/memory_resource/any_resource.cu +++ b/cudax/test/memory_resource/any_resource.cu @@ -24,7 +24,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - 
cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -43,7 +43,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -78,7 +78,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -105,7 +105,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", { Counts expected{}; { - cudax::mr::any_resource mr{TestResource{42, this}}; + cudax::any_resource mr{TestResource{42, this}}; expected.new_count += is_big; ++expected.object_count; ++expected.move_count; @@ -135,8 +135,8 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "any_resource", "[container][resource]", Counts expected{}; CHECK(this->counts == expected); { - cudax::mr::any_resource mr = - cudax::mr::make_any_resource(42, this); + cudax::any_resource mr = + cudax::make_any_resource(42, this); expected.new_count += is_big; ++expected.object_count; CHECK(this->counts == expected); diff --git a/cudax/test/memory_resource/device_memory_pool.cu b/cudax/test/memory_resource/device_memory_pool.cu index 22faeda6bb8..3260829c4b6 100644 --- a/cudax/test/memory_resource/device_memory_pool.cu +++ b/cudax/test/memory_resource/device_memory_pool.cu @@ -22,7 +22,7 @@ #include namespace cudax = cuda::experimental; -using pool = cudax::mr::device_memory_pool; +using pool = cudax::device_memory_pool; static_assert(!cuda::std::is_trivial::value, ""); static_assert(!cuda::std::is_trivially_default_constructible::value, ""); static_assert(!cuda::std::is_default_constructible::value, ""); @@ -89,10 +89,10 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") current_device); } - using memory_pool = cudax::mr::device_memory_pool; + using memory_pool = cudax::device_memory_pool; SECTION("Construct from device id") { - cudax::mr::device_memory_pool from_device{current_device}; + cudax::device_memory_pool from_device{current_device}; ::cudaMemPool_t get = from_device.get(); CHECK(get != current_default_pool); @@ -109,7 +109,7 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") SECTION("Construct with empty properties") { - cudax::mr::memory_pool_properties props{}; + cudax::memory_pool_properties props{}; memory_pool from_defaulted_properties{current_device, props}; ::cudaMemPool_t get = from_defaulted_properties.get(); @@ -127,7 +127,7 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") SECTION("Construct with initial pool size") { - cudax::mr::memory_pool_properties props = {42, 20}; + cudax::memory_pool_properties props = {42, 20}; memory_pool with_threshold{current_device, props}; ::cudaMemPool_t get = with_threshold.get(); @@ -147,8 +147,8 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") #if _CCCL_CUDACC_AT_LEAST(11, 2) SECTION("Construct with allocation handle") { - cudax::mr::memory_pool_properties props = { - 42, 20, cudax::mr::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor}; + 
cudax::memory_pool_properties props = { + 42, 20, cudax::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor}; memory_pool with_allocation_handle{current_device, props}; ::cudaMemPool_t get = with_allocation_handle.get(); @@ -175,7 +175,7 @@ TEST_CASE("device_memory_pool construction", "[memory_resource]") ::cudaMemPool_t new_pool{}; _CCCL_TRY_CUDA_API(::cudaMemPoolCreate, "Failed to call cudaMemPoolCreate", &new_pool, &pool_properties); - cudax::mr::device_memory_pool from_handle = cudax::mr::device_memory_pool::from_native_handle(new_pool); + cudax::device_memory_pool from_handle = cudax::device_memory_pool::from_native_handle(new_pool); CHECK(from_handle.get() == new_pool); } } @@ -200,9 +200,9 @@ TEST_CASE("device_memory_pool comparison", "[memory_resource]") current_device); } - cudax::mr::device_memory_pool first{current_device}; + cudax::device_memory_pool first{current_device}; { // comparison against a plain device_memory_pool - cudax::mr::device_memory_pool second{current_device}; + cudax::device_memory_pool second{current_device}; CHECK(first == first); CHECK(first != second); } @@ -237,7 +237,7 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") SECTION("device_memory_pool::set_attribute") { - cudax::mr::device_memory_pool pool{current_device}; + cudax::device_memory_pool pool{current_device}; { // cudaMemPoolReuseFollowEventDependencies // Get the attribute value @@ -300,7 +300,7 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") } // prime the pool to a given size - cudax::mr::device_memory_resource resource{pool}; + cudax::device_memory_resource resource{pool}; cudax::stream stream{}; // Allocate a buffer to prime @@ -417,9 +417,9 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") SECTION("device_memory_pool::trim_to") { - cudax::mr::device_memory_pool pool{current_device}; + cudax::device_memory_pool pool{current_device}; // prime the pool to a given size - cudax::mr::device_memory_resource resource{pool}; + cudax::device_memory_resource resource{pool}; cudax::stream stream{}; // Allocate 2 buffers @@ -476,7 +476,7 @@ TEST_CASE("device_memory_pool accessors", "[memory_resource]") auto peers = cudax::devices[0].get_peers(); if (peers.size() > 0) { - cudax::mr::device_memory_pool pool{cudax::devices[0]}; + cudax::device_memory_pool pool{cudax::devices[0]}; CUDAX_CHECK(pool.is_accessible_from(cudax::devices[0])); pool.enable_peer_access_from(peers); diff --git a/cudax/test/memory_resource/device_memory_resource.cu b/cudax/test/memory_resource/device_memory_resource.cu index 55839831bc4..29b5d4d9baf 100644 --- a/cudax/test/memory_resource/device_memory_resource.cu +++ b/cudax/test/memory_resource/device_memory_resource.cu @@ -21,15 +21,15 @@ namespace cudax = cuda::experimental; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_default_constructible::value, ""); -static_assert(cuda::std::is_copy_constructible::value, ""); -static_assert(cuda::std::is_move_constructible::value, ""); -static_assert(cuda::std::is_copy_assignable::value, ""); -static_assert(cuda::std::is_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); +static_assert(!cuda::std::is_trivial::value, ""); +static_assert(!cuda::std::is_trivially_default_constructible::value, ""); +static_assert(cuda::std::is_default_constructible::value, ""); 
+static_assert(cuda::std::is_copy_constructible::value, ""); +static_assert(cuda::std::is_move_constructible::value, ""); +static_assert(cuda::std::is_copy_assignable::value, ""); +static_assert(cuda::std::is_move_assignable::value, ""); +static_assert(cuda::std::is_trivially_destructible::value, ""); +static_assert(!cuda::std::is_empty::value, ""); static bool ensure_release_threshold(::cudaMemPool_t pool, const size_t expected_threshold) { @@ -87,7 +87,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") current_device); } - using async_resource = cuda::experimental::mr::device_memory_resource; + using async_resource = cuda::experimental::device_memory_resource; SECTION("Default construction") { { @@ -99,7 +99,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") void* ptr{nullptr}; _CCCL_TRY_CUDA_API( ::cudaMallocAsync, - "Failed to allocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to allocate with pool passed to cuda::experimental::device_memory_resource", &ptr, 42, current_default_pool, @@ -108,7 +108,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") _CCCL_ASSERT_CUDA_API( ::cudaFreeAsync, - "Failed to deallocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to deallocate with pool passed to cuda::experimental::device_memory_resource", ptr, ::cudaStream_t{0}); } @@ -133,7 +133,7 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") void* ptr{nullptr}; _CCCL_TRY_CUDA_API( ::cudaMallocAsync, - "Failed to allocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to allocate with pool passed to cuda::experimental::device_memory_resource", &ptr, 42, current_default_pool, @@ -142,17 +142,17 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") _CCCL_ASSERT_CUDA_API( ::cudaFreeAsync, - "Failed to deallocate with pool passed to cuda::experimental::mr::device_memory_resource", + "Failed to deallocate with pool passed to cuda::experimental::device_memory_resource", ptr, ::cudaStream_t{0}); } SECTION("Construct with initial pool size") { - cuda::experimental::mr::memory_pool_properties props = { + cuda::experimental::memory_pool_properties props = { 42, }; - cuda::experimental::mr::device_memory_pool pool{current_device, props}; + cuda::experimental::device_memory_pool pool{current_device, props}; async_resource from_initial_pool_size{pool}; ::cudaMemPool_t get = from_initial_pool_size.get(); @@ -170,11 +170,11 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") SECTION("Construct with release threshold") { - cuda::experimental::mr::memory_pool_properties props = { + cuda::experimental::memory_pool_properties props = { 42, 20, }; - cuda::experimental::mr::device_memory_pool pool{current_device, props}; + cuda::experimental::device_memory_pool pool{current_device, props}; async_resource with_threshold{pool}; ::cudaMemPool_t get = with_threshold.get(); @@ -194,12 +194,12 @@ TEST_CASE("device_memory_resource construction", "[memory_resource]") #if _CCCL_CUDACC_AT_LEAST(11, 2) SECTION("Construct with allocation handle") { - cuda::experimental::mr::memory_pool_properties props = { + cuda::experimental::memory_pool_properties props = { 42, 20, - cuda::experimental::mr::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor, + cuda::experimental::cudaMemAllocationHandleType::cudaMemHandleTypePosixFileDescriptor, }; - cuda::experimental::mr::device_memory_pool 
pool{current_device, props}; + cuda::experimental::device_memory_pool pool{current_device, props}; async_resource with_allocation_handle{pool}; ::cudaMemPool_t get = with_allocation_handle.get(); @@ -228,7 +228,7 @@ static void ensure_device_ptr(void* ptr) TEST_CASE("device_memory_resource allocation", "[memory_resource]") { - cuda::experimental::mr::device_memory_resource res{}; + cuda::experimental::device_memory_resource res{}; { // allocate / deallocate auto* ptr = res.allocate(42); @@ -404,9 +404,9 @@ TEST_CASE("device_memory_resource comparison", "[memory_resource]") _CCCL_TRY_CUDA_API(::cudaGetDevice, "Failed to query current device with cudaGetDevice.", ¤t_device); } - cuda::experimental::mr::device_memory_resource first{}; + cuda::experimental::device_memory_resource first{}; { // comparison against a plain device_memory_resource - cuda::experimental::mr::device_memory_resource second{}; + cuda::experimental::device_memory_resource second{}; CHECK(first == second); CHECK(!(first != second)); } @@ -421,13 +421,13 @@ TEST_CASE("device_memory_resource comparison", "[memory_resource]") pool_properties.location.id = current_device; _CCCL_TRY_CUDA_API(::cudaMemPoolCreate, "Failed to call cudaMemPoolCreate", &cuda_pool_handle, &pool_properties); } - cuda::experimental::mr::device_memory_resource second{cuda_pool_handle}; + cuda::experimental::device_memory_resource second{cuda_pool_handle}; CHECK(first != second); CHECK(!(first == second)); } { // comparison against a device_memory_resource wrapped inside a resource_ref - cuda::experimental::mr::device_memory_resource second{}; + cuda::experimental::device_memory_resource second{}; cuda::mr::resource_ref second_ref{second}; CHECK(first == second_ref); CHECK(!(first != second_ref)); @@ -436,7 +436,7 @@ TEST_CASE("device_memory_resource comparison", "[memory_resource]") } { // comparison against a device_memory_resource wrapped inside a async_resource_ref - cuda::experimental::mr::device_memory_resource second{}; + cuda::experimental::device_memory_resource second{}; cuda::mr::async_resource_ref second_ref{second}; CHECK(first == second_ref); @@ -481,8 +481,8 @@ TEST_CASE("Async memory resource peer access") auto peers = cudax::devices[0].get_peers(); if (peers.size() > 0) { - cudax::mr::device_memory_pool pool{cudax::devices[0]}; - cudax::mr::device_memory_resource resource{pool}; + cudax::device_memory_pool pool{cudax::devices[0]}; + cudax::device_memory_resource resource{pool}; cudax::stream stream{peers.front()}; CUDAX_CHECK(resource.is_accessible_from(cudax::devices[0])); @@ -503,7 +503,7 @@ TEST_CASE("Async memory resource peer access") CUDAX_CHECK(resource.is_accessible_from(peers.front())); allocate_and_check_access(resource); - cudax::mr::device_memory_resource another_resource{pool}; + cudax::device_memory_resource another_resource{pool}; CUDAX_CHECK(another_resource.is_accessible_from(peers.front())); allocate_and_check_access(another_resource); @@ -527,8 +527,8 @@ TEST_CASE("Async memory resource peer access") resource.enable_peer_access_from(peers); // Check the resource using the default pool - cudax::mr::device_memory_resource default_pool_resource{}; - cudax::mr::device_memory_resource another_default_pool_resource{}; + cudax::device_memory_resource default_pool_resource{}; + cudax::device_memory_resource another_default_pool_resource{}; default_pool_resource.enable_peer_access_from(peers.front()); diff --git a/cudax/test/memory_resource/managed_memory_resource.cu b/cudax/test/memory_resource/managed_memory_resource.cu new 
file mode 100644 index 00000000000..073402124bd --- /dev/null +++ b/cudax/test/memory_resource/managed_memory_resource.cu @@ -0,0 +1,273 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace cudax = cuda::experimental; + +using managed_resource = cudax::managed_memory_resource; +static_assert(!cuda::std::is_trivial::value, ""); +static_assert(!cuda::std::is_trivially_default_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_constructible::value, ""); +static_assert(cuda::std::is_trivially_move_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_assignable::value, ""); +static_assert(cuda::std::is_trivially_move_assignable::value, ""); +static_assert(cuda::std::is_trivially_destructible::value, ""); +static_assert(!cuda::std::is_empty::value, ""); + +static void ensure_managed_ptr(void* ptr) +{ + CHECK(ptr != nullptr); + cudaPointerAttributes attributes; + cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); + CHECK(status == cudaSuccess); + CHECK(attributes.type == cudaMemoryTypeManaged); +} + +TEST_CASE("managed_memory_resource construction", "[memory_resource]") +{ + SECTION("Default construction") + { + STATIC_REQUIRE(cuda::std::is_default_constructible_v); + } + + SECTION("Construct with flag") + { + managed_resource defaulted{}; + managed_resource with_flag{cudaMemAttachHost}; + CHECK(defaulted != with_flag); + } +} + +TEST_CASE("managed_memory_resource allocation", "[memory_resource]") +{ + managed_resource res{}; + cudax::stream stream{}; + + { // allocate / deallocate + auto* ptr = res.allocate(42); + static_assert(cuda::std::is_same::value, ""); + ensure_managed_ptr(ptr); + + res.deallocate(ptr, 42); + } + + { // allocate / deallocate with alignment + auto* ptr = res.allocate(42, 4); + static_assert(cuda::std::is_same::value, ""); + ensure_managed_ptr(ptr); + + res.deallocate(ptr, 42, 4); + } + + { // allocate_async / deallocate_async + auto* ptr = res.allocate_async(42, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_managed_ptr(ptr); + + res.deallocate_async(ptr, 42, stream); + } + + { // allocate_async / deallocate_async with alignment + auto* ptr = res.allocate_async(42, 4, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_managed_ptr(ptr); + + res.deallocate_async(ptr, 42, 4, stream); + } + +#ifndef _LIBCUDACXX_NO_EXCEPTIONS + { // allocate with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 42); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + + { // allocate with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 1337); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + { // allocate_async with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 42, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } 
+ } + + { // allocate_async with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 1337, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } +#endif // _LIBCUDACXX_NO_EXCEPTIONS +} + +enum class AccessibilityType +{ + Device, + Host, +}; + +template +struct resource +{ + void* allocate(size_t, size_t) + { + return nullptr; + } + void deallocate(void*, size_t, size_t) noexcept {} + + bool operator==(const resource&) const + { + return true; + } + bool operator!=(const resource& other) const + { + return false; + } +}; +static_assert(cuda::mr::resource>, ""); +static_assert(cuda::mr::resource>, ""); + +template +struct async_resource : public resource +{ + void* allocate_async(size_t, size_t, cuda::stream_ref) + { + return nullptr; + } + void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} +}; +static_assert(cuda::mr::async_resource>, ""); +static_assert(cuda::mr::async_resource>, ""); + +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_managed_resource : cudax::managed_memory_resource +{ + using cudax::managed_memory_resource::managed_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + +TEST_CASE("managed_memory_resource comparison", "[memory_resource]") +{ + managed_resource first{}; + { // comparison against a plain managed_memory_resource + managed_resource second{}; + CHECK(first == second); + CHECK(!(first != second)); + } + + { // comparison against a plain managed_memory_resource with a different pool + managed_resource second{cudaMemAttachHost}; + CHECK(first != second); + CHECK(!(first == second)); + } + + { // comparison against a managed_memory_resource wrapped inside a resource_ref + managed_resource second{}; + cuda::mr::resource_ref second_ref{second}; + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a managed_memory_resource wrapped inside a async_resource_ref + managed_resource second{}; + cuda::mr::async_resource_ref second_ref{second}; + + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a different managed_resource through resource_ref + resource host_resource{}; + resource device_resource{}; + CHECK(!(first == host_resource)); + CHECK(first != host_resource); + CHECK(!(first == device_resource)); + CHECK(first != device_resource); + + CHECK(!(host_resource == first)); + CHECK(host_resource != first); + CHECK(!(device_resource == first)); + CHECK(device_resource != first); + } + + { // comparison against a different managed_resource through resource_ref + resource host_async_resource{}; + resource device_async_resource{}; + CHECK(!(first == host_async_resource)); + CHECK(first != host_async_resource); + CHECK(!(first == device_async_resource)); + CHECK(first != device_async_resource); + + CHECK(!(host_async_resource == first)); + CHECK(host_async_resource != first); + CHECK(!(device_async_resource == first)); + CHECK(device_async_resource != first); + } +} diff --git a/cudax/test/memory_resource/pinned_memory_resource.cu b/cudax/test/memory_resource/pinned_memory_resource.cu new file mode 100644 index 00000000000..6423b292de0 --- /dev/null +++ b/cudax/test/memory_resource/pinned_memory_resource.cu @@ -0,0 +1,274 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA 
Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace cudax = cuda::experimental; + +using pinned_resource = cudax::pinned_memory_resource; +static_assert(!cuda::std::is_trivial::value, ""); +static_assert(!cuda::std::is_trivially_default_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_constructible::value, ""); +static_assert(cuda::std::is_trivially_move_constructible::value, ""); +static_assert(cuda::std::is_trivially_copy_assignable::value, ""); +static_assert(cuda::std::is_trivially_move_assignable::value, ""); +static_assert(cuda::std::is_trivially_destructible::value, ""); +static_assert(!cuda::std::is_empty::value, ""); + +static void ensure_pinned_ptr(void* ptr) +{ + CHECK(ptr != nullptr); + cudaPointerAttributes attributes; + cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); + CHECK(status == cudaSuccess); + CHECK(attributes.type == cudaMemoryTypeHost); + CHECK(attributes.devicePointer != nullptr); +} + +TEST_CASE("pinned_memory_resource construction", "[memory_resource]") +{ + SECTION("Default construction") + { + STATIC_REQUIRE(cuda::std::is_default_constructible_v); + } + + SECTION("Construct with flag") + { + pinned_resource defaulted{}; + pinned_resource with_flag{cudaHostAllocMapped}; + CHECK(defaulted != with_flag); + } +} + +TEST_CASE("pinned_memory_resource allocation", "[memory_resource]") +{ + pinned_resource res{}; + cudax::stream stream{}; + + { // allocate / deallocate + auto* ptr = res.allocate(42); + static_assert(cuda::std::is_same::value, ""); + ensure_pinned_ptr(ptr); + + res.deallocate(ptr, 42); + } + + { // allocate / deallocate with alignment + auto* ptr = res.allocate(42, 4); + static_assert(cuda::std::is_same::value, ""); + ensure_pinned_ptr(ptr); + + res.deallocate(ptr, 42, 4); + } + + { // allocate_async / deallocate_async + auto* ptr = res.allocate_async(42, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_pinned_ptr(ptr); + + res.deallocate_async(ptr, 42, stream); + } + + { // allocate_async / deallocate_async with alignment + auto* ptr = res.allocate_async(42, 4, stream); + static_assert(cuda::std::is_same::value, ""); + + stream.wait(); + ensure_pinned_ptr(ptr); + + res.deallocate_async(ptr, 42, 4, stream); + } + +#ifndef _LIBCUDACXX_NO_EXCEPTIONS + { // allocate with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 42); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + + { // allocate with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate(5, 1337); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + { // allocate_async with too small alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 42, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + CHECK(false); + } + } + + { // allocate_async with non matching alignment + while (true) + { + try + { + auto* ptr = res.allocate_async(5, 1337, stream); + (void) ptr; + } + catch (std::invalid_argument&) + { + break; + } + 
CHECK(false); + } + } +#endif // _LIBCUDACXX_NO_EXCEPTIONS +} + +enum class AccessibilityType +{ + Device, + Host, +}; + +template +struct resource +{ + void* allocate(size_t, size_t) + { + return nullptr; + } + void deallocate(void*, size_t, size_t) noexcept {} + + bool operator==(const resource&) const + { + return true; + } + bool operator!=(const resource& other) const + { + return false; + } +}; +static_assert(cuda::mr::resource>, ""); +static_assert(cuda::mr::resource>, ""); + +template +struct async_resource : public resource +{ + void* allocate_async(size_t, size_t, cuda::stream_ref) + { + return nullptr; + } + void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} +}; +static_assert(cuda::mr::async_resource>, ""); +static_assert(cuda::mr::async_resource>, ""); + +// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 +struct derived_pinned_resource : cudax::pinned_memory_resource +{ + using cudax::pinned_memory_resource::pinned_memory_resource; +}; +static_assert(cuda::mr::resource, ""); + +TEST_CASE("pinned_memory_resource comparison", "[memory_resource]") +{ + pinned_resource first{}; + { // comparison against a plain pinned_memory_resource + pinned_resource second{}; + CHECK(first == second); + CHECK(!(first != second)); + } + + { // comparison against a plain pinned_memory_resource with a different pool + pinned_resource second{cudaMemAttachHost}; + CHECK(first != second); + CHECK(!(first == second)); + } + + { // comparison against a pinned_memory_resource wrapped inside a resource_ref + pinned_resource second{}; + cuda::mr::resource_ref second_ref{second}; + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a pinned_memory_resource wrapped inside a async_resource_ref + pinned_resource second{}; + cuda::mr::async_resource_ref second_ref{second}; + + CHECK(first == second_ref); + CHECK(!(first != second_ref)); + CHECK(second_ref == first); + CHECK(!(second_ref != first)); + } + + { // comparison against a different pinned_resource through resource_ref + resource host_resource{}; + resource device_resource{}; + CHECK(!(first == host_resource)); + CHECK(first != host_resource); + CHECK(!(first == device_resource)); + CHECK(first != device_resource); + + CHECK(!(host_resource == first)); + CHECK(host_resource != first); + CHECK(!(device_resource == first)); + CHECK(device_resource != first); + } + + { // comparison against a different pinned_resource through resource_ref + resource host_async_resource{}; + resource device_async_resource{}; + CHECK(!(first == host_async_resource)); + CHECK(first != host_async_resource); + CHECK(!(first == device_async_resource)); + CHECK(first != device_async_resource); + + CHECK(!(host_async_resource == first)); + CHECK(host_async_resource != first); + CHECK(!(device_async_resource == first)); + CHECK(device_async_resource != first); + } +} diff --git a/cudax/test/memory_resource/shared_resource.cu b/cudax/test/memory_resource/shared_resource.cu index 4cdd7bc1d31..02e98f10cf9 100644 --- a/cudax/test/memory_resource/shared_resource.cu +++ b/cudax/test/memory_resource/shared_resource.cu @@ -18,14 +18,14 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource]", big_resource, small_resource) { using TestResource = TestType; - static_assert(cuda::mr::async_resource>); + static_assert(cuda::mr::async_resource>); SECTION("construct and destruct") { Counts expected{}; CHECK(this->counts == expected); 
    {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
     }
@@ -42,7 +42,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
     Counts expected{};
     CHECK(this->counts == expected);
     {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
@@ -56,7 +56,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
       CHECK(mr2 == mr3); // pointers compare equal, no call to TestResource::operator==
       CHECK(this->counts == expected);
-      cudax::mr::shared_resource mr4{TestResource{42, this}};
+      cudax::shared_resource mr4{TestResource{42, this}};
       ++expected.object_count;
       ++expected.move_count;
       CHECK(mr3 == mr4); // pointers are not equal, calls TestResource::operator==
@@ -76,7 +76,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
     Counts expected{};
     CHECK(this->counts == expected);
     {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
@@ -101,7 +101,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
   {
     Counts expected{};
     {
-      cudax::mr::shared_resource mr{42, this};
+      cudax::shared_resource mr{42, this};
       ++expected.object_count;
       CHECK(this->counts == expected);
@@ -130,7 +130,7 @@ TEMPLATE_TEST_CASE_METHOD(test_fixture, "shared_resource", "[container][resource
     {
       bytes(42 * sizeof(int));
       cudax::uninitialized_buffer buffer{
-        cudax::mr::shared_resource(42, this), 42};
+        cudax::shared_resource(42, this), 42};
       ++expected.object_count;
       ++expected.allocate_count;
       CHECK(this->counts == expected);
diff --git a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h
deleted file mode 100644
index 72e01a5521d..00000000000
--- a/libcudacxx/include/cuda/__memory_resource/device_memory_resource.h
+++ /dev/null
@@ -1,219 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of libcu++, the C++ Standard Library for your entire system,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H
-#define _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H
-
-#include
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-# pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-# pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-# pragma system_header
-#endif // no system header
-
-#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
-
-# if defined(_CCCL_CUDA_COMPILER_CLANG)
-# include
-# endif // _CCCL_CUDA_COMPILER_CLANG
-
-# include
-# include
-# include
-# include
-# include
-# include
-# include
-# include
-
-# if _CCCL_STD_VER >= 2014
-
-_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR
-
-//! @brief device_memory_resource uses `cudaMalloc` / `cudaFree` for allocation / deallocation.
-//! By default uses device 0 to allocate memory
-class device_memory_resource
-{
-private:
-  int __device_id_{0};
-
-public:
-  //!
@brief default constructs a device_memory_resource allocating memory on device 0 - _CCCL_HIDE_FROM_ABI device_memory_resource() = default; - - //! @brief default constructs a device_memory_resource allocating memory on device \p __device_id - //! @param __device_id The id of the device we are allocating memory on - constexpr device_memory_resource(const int __device_id) noexcept - : __device_id_(__device_id) - {} - - //! @brief Allocate device memory of size at least \p __bytes. - //! @param __bytes The size in bytes of the allocation. - //! @param __alignment The requested alignment of the allocation. - //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. - //! @return Pointer to the newly allocated memory - _CCCL_NODISCARD void* allocate(const size_t __bytes, const size_t __alignment = default_cuda_malloc_alignment) const - { - // We need to ensure that the provided alignment matches the minimal provided alignment - if (!__is_valid_alignment(__alignment)) - { - _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to device_memory_resource::allocate."); - } - - // We need to ensure that we allocate on the right device as `cudaMalloc` always uses the current device - __ensure_current_device __device_wrapper{__device_id_}; - - void* __ptr{nullptr}; - _CCCL_TRY_CUDA_API(::cudaMalloc, "Failed to allocate memory with cudaMalloc.", &__ptr, __bytes); - return __ptr; - } - - //! @brief Deallocate memory pointed to by \p __ptr. - //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate` - //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. - //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. - void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept - { - // We need to ensure that the provided alignment matches the minimal provided alignment - _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to device_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "device_memory_resource::deallocate failed", __ptr); - (void) __alignment; - } - - //! @brief Equality comparison with another \c device_memory_resource - //! @param __other The other \c device_memory_resource - //! @return true, if both resources hold the same device id - _CCCL_NODISCARD constexpr bool operator==(device_memory_resource const& __other) const noexcept - { - return __device_id_ == __other.__device_id_; - } -# if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c device_memory_resource - //! @param __other The other \c device_memory_resource - //! @return true, if both resources hold different device id's - _CCCL_NODISCARD constexpr bool operator!=(device_memory_resource const& __other) const noexcept - { - return __device_id_ != __other.__device_id_; - } -# endif // _CCCL_STD_VER <= 2017 - -# if _CCCL_STD_VER >= 2020 - //! @brief Equality comparison between a \c device_memory_resource and another resource - //! @param __rhs The resource to compare to - //! @return If the underlying types are equality comparable, returns the result of equality comparison of both - //! resources. Otherwise, returns false. 
- _CCCL_TEMPLATE(class _Resource) - _CCCL_REQUIRES((__different_resource) ) - _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept - { - if constexpr (has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else - { - return false; - } - } -# else // ^^^ C++20 ^^^ / vvv C++17 - template - _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(device_memory_resource const&, _Resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const&, device_memory_resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - != resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(device_memory_resource const&, _Resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return true; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, device_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - != resource_ref{const_cast<_Resource&>(__rhs)}; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const&, device_memory_resource const&) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource - && !has_property<_Resource, device_accessible>) - { - return true; - } -# endif // _CCCL_STD_VER <= 2017 - - //! @brief Enables the \c device_accessible property - friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {} - - //! 
@brief Checks whether the passed in alignment is valid - static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept - { - return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); - } -}; -static_assert(resource_with, ""); - -// For backward compatability -using cuda_memory_resource _LIBCUDACXX_DEPRECATED = device_memory_resource; - -_LIBCUDACXX_END_NAMESPACE_CUDA_MR - -# endif // _CCCL_STD_VER >= 2014 - -#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE - -#endif // _CUDA__MEMORY_RESOURCE_CUDA_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h deleted file mode 100644 index 86835aede18..00000000000 --- a/libcudacxx/include/cuda/__memory_resource/managed_memory_resource.h +++ /dev/null @@ -1,201 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H -#define _CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) - -# if defined(_CCCL_CUDA_COMPILER_CLANG) -# include -# endif // _CCCL_CUDA_COMPILER_CLANG - -# include -# include -# include -# include -# include -# include -# include - -# if _CCCL_STD_VER >= 2014 - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR - -//! @brief \c managed_memory_resource uses `cudaMallocManaged` / `cudaFree` for allocation / deallocation. -class managed_memory_resource -{ -private: - unsigned int __flags_ = cudaMemAttachGlobal; - - static constexpr unsigned int __available_flags = cudaMemAttachGlobal | cudaMemAttachHost; - -public: - constexpr managed_memory_resource(const unsigned int __flags = cudaMemAttachGlobal) noexcept - : __flags_(__flags & __available_flags) - { - _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to managed_memory_resource"); - } - - //! @brief Allocate CUDA unified memory of size at least \p __bytes. - //! @param __bytes The size in bytes of the allocation. - //! @param __alignment The requested alignment of the allocation. - //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. - //! 
@return Pointer to the newly allocated memory - _CCCL_NODISCARD void* allocate(const size_t __bytes, const size_t __alignment = default_cuda_malloc_alignment) const - { - // We need to ensure that the provided alignment matches the minimal provided alignment - if (!__is_valid_alignment(__alignment)) - { - _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to managed_memory_resource::allocate."); - } - - void* __ptr{nullptr}; - _CCCL_TRY_CUDA_API( - ::cudaMallocManaged, "Failed to allocate memory with cudaMallocManaged.", &__ptr, __bytes, __flags_); - return __ptr; - } - - //! @brief Deallocate memory pointed to by \p __ptr. - //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. - //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. - //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. - void deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_alignment) const noexcept - { - // We need to ensure that the provided alignment matches the minimal provided alignment - _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to managed_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFree, "managed_memory_resource::deallocate failed", __ptr); - (void) __alignment; - } - - //! @brief Equality comparison with another \c managed_memory_resource. - //! @param __other The other \c managed_memory_resource. - //! @return Whether both \c managed_memory_resource were constructed with the same flags. - _CCCL_NODISCARD constexpr bool operator==(managed_memory_resource const& __other) const noexcept - { - return __flags_ == __other.__flags_; - } -# if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c managed_memory_resource. - //! @param __other The other \c managed_memory_resource. - //! @return Whether both \c managed_memory_resource were constructed with different flags. - _CCCL_NODISCARD constexpr bool operator!=(managed_memory_resource const& __other) const noexcept - { - return __flags_ != __other.__flags_; - } -# endif // _CCCL_STD_VER <= 2017 - -# if _CCCL_STD_VER >= 2020 - //! @brief Equality comparison between a \c managed_memory_resource and another resource - //! @param __rhs The resource to compare to - //! @return If the underlying types are equality comparable, returns the result of equality comparison of both - //! resources. Otherwise, returns false. 
- _CCCL_TEMPLATE(class _Resource) - _CCCL_REQUIRES(__different_resource) - _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept - { - if constexpr (has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else if constexpr (has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else - { - return false; - } - } -# else // ^^^ C++20 ^^^ / vvv C++17 - template - _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, managed_memory_resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return __rhs == __lhs; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(managed_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__lhs == __rhs); - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, managed_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__rhs == __lhs); - } -# endif // _CCCL_STD_VER <= 2017 - - //! @brief Enables the \c device_accessible property - friend constexpr void get_property(managed_memory_resource const&, device_accessible) noexcept {} - //! @brief Enables the \c host_accessible property - friend constexpr void get_property(managed_memory_resource const&, host_accessible) noexcept {} - - //! 
@brief Checks whether the passed in alignment is valid - static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept - { - return __alignment <= default_cuda_malloc_alignment && (default_cuda_malloc_alignment % __alignment == 0); - } -}; -static_assert(resource_with, ""); -static_assert(resource_with, ""); - -// For backward compatability -using cuda_managed_memory_resource _LIBCUDACXX_DEPRECATED = managed_memory_resource; - -_LIBCUDACXX_END_NAMESPACE_CUDA_MR - -# endif // _CCCL_STD_VER >= 2014 - -#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE - -#endif //_CUDA__MEMORY_RESOURCE_CUDA_MANAGED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h b/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h deleted file mode 100644 index 819d485a104..00000000000 --- a/libcudacxx/include/cuda/__memory_resource/pinned_memory_resource.h +++ /dev/null @@ -1,204 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H -#define _CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if !_CCCL_COMPILER(MSVC2017) && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) - -# if defined(_CCCL_CUDA_COMPILER_CLANG) -# include -# include -# endif // _CCCL_CUDA_COMPILER_CLANG - -# include -# include -# include -# include -# include -# include -# include - -# if _CCCL_STD_VER >= 2014 - -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR - -//! @brief pinned_memory_resource uses `cudaMallocHost` / `cudaFreeHost` for allocation / deallocation. -class pinned_memory_resource -{ -private: - unsigned int __flags_ = cudaHostAllocDefault; - - static constexpr unsigned int __available_flags = - cudaHostAllocDefault | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined; - -public: - constexpr pinned_memory_resource(const unsigned int __flags = cudaHostAllocDefault) noexcept - : __flags_(__flags & __available_flags) - { - _CCCL_ASSERT(__flags_ == __flags, "Unexpected flags passed to pinned_memory_resource"); - } - - //! @brief Allocate host memory of size at least \p __bytes. - //! @param __bytes The size in bytes of the allocation. - //! @param __alignment The requested alignment of the allocation. - //! @throw std::invalid_argument in case of invalid alignment or \c cuda::cuda_error of the returned error code. - //! 
@return Pointer to the newly allocated memory - _CCCL_NODISCARD void* allocate(const size_t __bytes, - const size_t __alignment = default_cuda_malloc_host_alignment) const - { - // We need to ensure that the provided alignment matches the minimal provided alignment - if (!__is_valid_alignment(__alignment)) - { - _CUDA_VSTD::__throw_invalid_argument("Invalid alignment passed to pinned_memory_resource::allocate."); - } - - void* __ptr{nullptr}; - _CCCL_TRY_CUDA_API(::cudaMallocHost, "Failed to allocate memory with cudaMallocHost.", &__ptr, __bytes, __flags_); - return __ptr; - } - - //! @brief Deallocate memory pointed to by \p __ptr. - //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. - //! @param __bytes The number of bytes that was passed to the `allocate` call that returned \p __ptr. - //! @param __alignment The alignment that was passed to the `allocate` call that returned \p __ptr. - void - deallocate(void* __ptr, const size_t, const size_t __alignment = default_cuda_malloc_host_alignment) const noexcept - { - // We need to ensure that the provided alignment matches the minimal provided alignment - _CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to pinned_memory_resource::deallocate."); - _CCCL_ASSERT_CUDA_API(::cudaFreeHost, "pinned_memory_resource::deallocate failed", __ptr); - (void) __alignment; - } - - //! @brief Equality comparison with another \c pinned_memory_resource. - //! @param __other The other \c pinned_memory_resource. - //! @return Whether both \c pinned_memory_resource were constructed with the same flags. - _CCCL_NODISCARD constexpr bool operator==(pinned_memory_resource const& __other) const noexcept - { - return __flags_ == __other.__flags_; - } -# if _CCCL_STD_VER <= 2017 - //! @brief Equality comparison with another \c pinned_memory_resource. - //! @param __other The other \c pinned_memory_resource. - //! @return Whether both \c pinned_memory_resource were constructed with different flags. - _CCCL_NODISCARD constexpr bool operator!=(pinned_memory_resource const& __other) const noexcept - { - return __flags_ != __other.__flags_; - } -# endif // _CCCL_STD_VER <= 2017 - -# if _CCCL_STD_VER >= 2020 - //! @brief Equality comparison between a \c pinned_memory_resource and another resource - //! @param __rhs The resource to compare to - //! @return If the underlying types are equality comparable, returns the result of equality comparison of both - //! resources. Otherwise, returns false. 
- _CCCL_TEMPLATE(class _Resource) - _CCCL_REQUIRES(__different_resource) - _CCCL_NODISCARD bool operator==(_Resource const& __rhs) const noexcept - { - if constexpr (has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else if constexpr (has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(this)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - else - { - return false; - } - } -# else // ^^^ C++20 ^^^ / vvv C++17 - template - _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource&& has_property<_Resource, host_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && has_property<_Resource, device_accessible>) - { - return resource_ref{const_cast(__lhs)} - == resource_ref{const_cast<_Resource&>(__rhs)}; - } - template - _CCCL_NODISCARD_FRIEND auto operator==(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)( - __different_resource && !has_property<_Resource, host_accessible> - && !has_property<_Resource, device_accessible>) - { - return false; - } - - template - _CCCL_NODISCARD_FRIEND auto operator==(_Resource const& __lhs, pinned_memory_resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return __rhs == __lhs; - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(pinned_memory_resource const& __lhs, _Resource const& __rhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__lhs == __rhs); - } - - template - _CCCL_NODISCARD_FRIEND auto operator!=(_Resource const& __rhs, pinned_memory_resource const& __lhs) noexcept - _CCCL_TRAILING_REQUIRES(bool)(__different_resource) - { - return !(__rhs == __lhs); - } -# endif // _CCCL_STD_VER <= 2017 - - //! @brief Enables the \c device_accessible property - friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} - //! @brief Enables the \c host_accessible property - friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} - - //! @brief Checks whether the passed in alignment is valid - static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept - { - return __alignment <= default_cuda_malloc_host_alignment && (default_cuda_malloc_host_alignment % __alignment == 0); - } -}; -static_assert(resource_with, ""); -static_assert(resource_with, ""); - -// For backward compatability -using cuda_pinned_memory_resource _LIBCUDACXX_DEPRECATED = pinned_memory_resource; - -_LIBCUDACXX_END_NAMESPACE_CUDA_MR - -# endif // _CCCL_STD_VER >= 2014 - -#endif // !_CCCL_COMPILER(MSVC2017) && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE - -#endif //_CUDA__MEMORY_RESOURCE_CUDA_PINNED_MEMORY_RESOURCE_H diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index d2e4296b749..e1c0ac468c1 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -32,10 +32,7 @@ //! 
//!@endrst -#include #include -#include -#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp deleted file mode 100644 index fe983aa93de..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/allocate.pass.cpp +++ /dev/null @@ -1,95 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -#include "test_macros.h" - -void ensure_device_ptr(void* ptr) -{ - assert(ptr != nullptr); - cudaPointerAttributes attributes; - cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); - assert(status == cudaSuccess); - assert(attributes.type == cudaMemoryTypeDevice); -} - -void test() -{ - cuda::mr::device_memory_resource res{}; - - { // allocate / deallocate - auto* ptr = res.allocate(42); - static_assert(cuda::std::is_same::value, ""); - ensure_device_ptr(ptr); - - res.deallocate(ptr, 42); - } - - { // allocate / deallocate with alignment - constexpr size_t desired_alignment = 64; - auto* ptr = res.allocate(42, desired_alignment); - static_assert(cuda::std::is_same::value, ""); - ensure_device_ptr(ptr); - - // also check the alignment - const auto address = reinterpret_cast(ptr); - const auto alignment = address & (~address + 1ULL); - assert(alignment >= desired_alignment); - res.deallocate(ptr, 42, desired_alignment); - } - -#ifndef TEST_HAS_NO_EXCEPTIONS - { // allocate with too small alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 42); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } - - { // allocate with non matching alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 1337); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } -#endif // TEST_HAS_NO_EXCEPTIONS -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp deleted file mode 100644 index 56be1650df5..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/equality.pass.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -enum class AccessibilityType -{ - Device, - Host, -}; - -template -struct resource -{ - void* allocate(size_t, size_t) - { - return nullptr; - } - void deallocate(void*, size_t, size_t) noexcept {} - - bool operator==(const resource&) const - { - return true; - } - bool operator!=(const resource& other) const - { - return false; - } - - template = 0> - friend void get_property(const resource&, cuda::mr::device_accessible) noexcept - {} - template = 0> - friend void get_property(const resource&, cuda::mr::host_accessible) noexcept - {} -}; -static_assert(cuda::mr::resource>, ""); -static_assert(!cuda::mr::resource_with, cuda::mr::device_accessible>, ""); -static_assert(cuda::mr::resource>, ""); -static_assert(cuda::mr::resource_with, cuda::mr::device_accessible>, ""); - -template -struct async_resource : public resource -{ - void* allocate_async(size_t, size_t, cuda::stream_ref) - { - return nullptr; - } - void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} -}; -static_assert(cuda::mr::async_resource>, ""); -static_assert(!cuda::mr::async_resource_with, cuda::mr::device_accessible>, ""); -static_assert(cuda::mr::async_resource>, ""); -static_assert(cuda::mr::async_resource_with, cuda::mr::device_accessible>, - ""); - -// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_resource : cuda::mr::device_memory_resource -{ - using cuda::mr::device_memory_resource::device_memory_resource; -}; -static_assert(cuda::mr::resource, ""); - -// Ensure that we can only - -void test() -{ - cuda::mr::device_memory_resource first{}; - { // comparison against a plain device_memory_resource - cuda::mr::device_memory_resource second{}; - assert(first == second); - assert(!(first != second)); - } - - { // comparison against a device_memory_resource wrapped inside a resource_ref - cuda::mr::device_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a device_memory_resource wrapped inside a resource_ref - cuda::mr::device_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a different resource - resource host_resource{}; - resource device_resource{}; - assert(!(first == host_resource)); - assert(first != host_resource); - assert(!(first == device_resource)); - assert(first != device_resource); - - assert(!(host_resource == first)); - assert(host_resource != first); - assert(!(device_resource == first)); - assert(device_resource != first); - } - - { // comparison against a different resource through resource_ref - async_resource host_async_resource{}; - async_resource device_async_resource{}; - cuda::mr::resource_ref host_ref{host_async_resource}; - cuda::mr::resource_ref device_ref{device_async_resource}; - assert(!(first == host_ref)); - assert(first != host_ref); - assert(!(first == device_async_resource)); - assert(first != device_async_resource); - - assert(!(host_ref == first)); - assert(host_ref != first); - assert(!(device_async_resource == first)); - assert(device_async_resource != first); - } -} - 
-int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp deleted file mode 100644 index d642b83bf02..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/device_memory_resource/traits.pass.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include - -using resource = cuda::mr::device_memory_resource; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_constructible::value, ""); -static_assert(cuda::std::is_trivially_move_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_assignable::value, ""); -static_assert(cuda::std::is_trivially_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); - -int main(int, char**) -{ - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp deleted file mode 100644 index f32093a1582..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/allocate.pass.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -#include "test_macros.h" - -void ensure_managed_ptr(void* ptr) -{ - assert(ptr != nullptr); - cudaPointerAttributes attributes; - cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); - assert(status == cudaSuccess); - assert(attributes.type == cudaMemoryTypeManaged); -} - -void test(const unsigned int flag) -{ - cuda::mr::managed_memory_resource res{flag}; - - { // allocate / deallocate - auto* ptr = res.allocate(42); - static_assert(cuda::std::is_same::value, ""); - ensure_managed_ptr(ptr); - - res.deallocate(ptr, 42); - } - - { // allocate / deallocate with alignment - auto* ptr = res.allocate(42, 4); - static_assert(cuda::std::is_same::value, ""); - ensure_managed_ptr(ptr); - - res.deallocate(ptr, 42, 4); - } - -#ifndef TEST_HAS_NO_EXCEPTIONS - { // allocate with too small alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 42); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } - - { // allocate with non matching alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 1337); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } -#endif // TEST_HAS_NO_EXCEPTIONS -} - -void test() -{ - test(cudaMemAttachGlobal); - test(cudaMemAttachHost); -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp deleted file mode 100644 index 2c42c469b4b..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/equality.pass.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -enum class AccessibilityType -{ - Device, - Host, -}; - -template -struct resource -{ - void* allocate(size_t, size_t) - { - return nullptr; - } - void deallocate(void*, size_t, size_t) noexcept {} - - bool operator==(const resource&) const - { - return true; - } - bool operator!=(const resource& other) const - { - return false; - } -}; -static_assert(cuda::mr::resource>, ""); -static_assert(cuda::mr::resource>, ""); - -template -struct async_resource : public resource -{ - void* allocate_async(size_t, size_t, cuda::stream_ref) - { - return nullptr; - } - void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} -}; -static_assert(cuda::mr::async_resource>, ""); -static_assert(cuda::mr::async_resource>, ""); - -// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_managed_resource : cuda::mr::managed_memory_resource -{ - using cuda::mr::managed_memory_resource::managed_memory_resource; -}; -static_assert(cuda::mr::resource, ""); - -void test() -{ - cuda::mr::managed_memory_resource first{}; - { // comparison against a plain managed_memory_resource - cuda::mr::managed_memory_resource second{}; - assert(first == second); - assert(!(first != second)); - } - - { // comparison against a plain managed_memory_resource with a different flag set - cuda::mr::managed_memory_resource second{cudaMemAttachHost}; - assert(!(first == second)); - assert((first != second)); - } - - { // comparison against a managed_memory_resource wrapped inside a resource_ref - cuda::mr::managed_memory_resource second{}; - assert(first == cuda::mr::resource_ref{second}); - assert(!(first != cuda::mr::resource_ref{second})); - assert(cuda::mr::resource_ref{second} == first); - assert(!(cuda::mr::resource_ref{second} != first)); - } - - { // comparison against a managed_memory_resource wrapped inside a resource_ref - cuda::mr::managed_memory_resource second{}; - assert(first == cuda::mr::resource_ref{second}); - assert(!(first != cuda::mr::resource_ref{second})); - assert(cuda::mr::resource_ref{second} == first); - assert(!(cuda::mr::resource_ref{second} != first)); - } - - { // comparison against a different resource through resource_ref - resource host_resource{}; - resource device_resource{}; - assert(!(first == host_resource)); - assert(first != host_resource); - assert(!(first == device_resource)); - assert(first != device_resource); - - assert(!(host_resource == first)); - assert(host_resource != first); - assert(!(device_resource == first)); - assert(device_resource != first); - } - - { // comparison against a different resource through resource_ref - async_resource host_async_resource{}; - async_resource device_async_resource{}; - assert(!(first == host_async_resource)); - assert(first != host_async_resource); - assert(!(first == device_async_resource)); - assert(first != device_async_resource); - - assert(!(host_async_resource == first)); - assert(host_async_resource != first); - assert(!(device_async_resource == first)); - assert(device_async_resource != first); - } -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp deleted file mode 100644 index 
02b9bd0294c..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/managed_memory_resource/traits.pass.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include - -using resource = cuda::mr::managed_memory_resource; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_constructible::value, ""); -static_assert(cuda::std::is_trivially_move_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_assignable::value, ""); -static_assert(cuda::std::is_trivially_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); - -int main(int, char**) -{ - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp deleted file mode 100644 index a8fff25ffa6..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/allocate.pass.cpp +++ /dev/null @@ -1,98 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -#include "test_macros.h" - -void ensure_pinned_host_ptr(void* ptr) -{ - assert(ptr != nullptr); - cudaPointerAttributes attributes; - cudaError_t status = cudaPointerGetAttributes(&attributes, ptr); - assert(status == cudaSuccess); - assert((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != nullptr)); -} - -void test(const unsigned int flag) -{ - cuda::mr::pinned_memory_resource res{flag}; - - { // allocate / deallocate - auto* ptr = res.allocate(42); - static_assert(cuda::std::is_same::value, ""); - ensure_pinned_host_ptr(ptr); - - res.deallocate(ptr, 42); - } - - { // allocate / deallocate with alignment - auto* ptr = res.allocate(42, 4); - static_assert(cuda::std::is_same::value, ""); - ensure_pinned_host_ptr(ptr); - - res.deallocate(ptr, 42, 4); - } - -#ifndef TEST_HAS_NO_EXCEPTIONS - { // allocate with too small alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 42); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } - - { // allocate with non matching alignment - while (true) - { - try - { - auto* ptr = res.allocate(5, 1337); - unused(ptr); - } - catch (const std::invalid_argument&) - { - break; - } - assert(false); - } - } -#endif // TEST_HAS_NO_EXCEPTIONS -} - -void test() -{ - test(cudaHostAllocDefault); - test(cudaHostAllocPortable); - test(cudaHostAllocMapped); - test(cudaHostAllocWriteCombined); -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp deleted file mode 100644 index e7f9918895d..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/equality.pass.cpp +++ /dev/null @@ -1,132 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include -#include -#include - -enum class AccessibilityType -{ - Device, - Host, -}; - -template -struct resource -{ - void* allocate(size_t, size_t) - { - return nullptr; - } - void deallocate(void*, size_t, size_t) noexcept {} - - bool operator==(const resource&) const - { - return true; - } - bool operator!=(const resource& other) const - { - return false; - } -}; -static_assert(cuda::mr::resource>, ""); -static_assert(cuda::mr::resource>, ""); - -template -struct async_resource : public resource -{ - void* allocate_async(size_t, size_t, cuda::stream_ref) - { - return nullptr; - } - void deallocate_async(void*, size_t, size_t, cuda::stream_ref) {} -}; -static_assert(cuda::mr::async_resource>, ""); -static_assert(cuda::mr::async_resource>, ""); - -// test for cccl#2214: https://github.com/NVIDIA/cccl/issues/2214 -struct derived_pinned_resource : cuda::mr::pinned_memory_resource -{ - using cuda::mr::pinned_memory_resource::pinned_memory_resource; -}; -static_assert(cuda::mr::resource, ""); - -void test() -{ - cuda::mr::pinned_memory_resource first{}; - { // comparison against a plain pinned_memory_resource - cuda::mr::pinned_memory_resource second{cudaHostAllocDefault}; - assert(first == second); - assert(!(first != second)); - } - - { // comparison against a plain pinned_memory_resource with a different flag set - cuda::mr::pinned_memory_resource second{cudaHostAllocPortable}; - assert(!(first == second)); - assert((first != second)); - } - - { // comparison against a pinned_memory_resource wrapped inside a resource_ref - cuda::mr::pinned_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a pinned_memory_resource wrapped inside a resource_ref - cuda::mr::pinned_memory_resource second{}; - cuda::mr::resource_ref second_ref{second}; - assert(first == second_ref); - assert(!(first != second_ref)); - assert(second_ref == first); - assert(!(second_ref != first)); - } - - { // comparison against a different resource through resource_ref - resource host_resource{}; - resource device_resource{}; - assert(!(first == host_resource)); - assert(first != host_resource); - assert(!(first == device_resource)); - assert(first != device_resource); - - assert(!(host_resource == first)); - assert(host_resource != first); - assert(!(device_resource == first)); - assert(device_resource != first); - } - - { // comparison against a different resource through resource_ref - async_resource host_async_resource{}; - async_resource device_async_resource{}; - assert(!(first == host_async_resource)); - assert(first != host_async_resource); - assert(!(first == device_async_resource)); - assert(first != device_async_resource); - - assert(!(host_async_resource == first)); - assert(host_async_resource != first); - assert(!(device_async_resource == first)); - assert(device_async_resource != first); - } -} - -int main(int, char**) -{ - NV_IF_TARGET(NV_IS_HOST, test();) - return 0; -} diff --git a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp deleted file mode 100644 index b0bbae95268..00000000000 --- 
a/libcudacxx/test/libcudacxx/cuda/memory_resource/pinned_memory_resource/traits.pass.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11 -// UNSUPPORTED: msvc-19.16 -// UNSUPPORTED: nvrtc - -#include -#include - -using resource = cuda::mr::pinned_memory_resource; -static_assert(!cuda::std::is_trivial::value, ""); -static_assert(!cuda::std::is_trivially_default_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_constructible::value, ""); -static_assert(cuda::std::is_trivially_move_constructible::value, ""); -static_assert(cuda::std::is_trivially_copy_assignable::value, ""); -static_assert(cuda::std::is_trivially_move_assignable::value, ""); -static_assert(cuda::std::is_trivially_destructible::value, ""); -static_assert(!cuda::std::is_empty::value, ""); - -int main(int, char**) -{ - return 0; -} From c7ed9749f052fc1c8aaf825af2f7d6447c479b17 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Tue, 26 Nov 2024 01:32:34 -0800 Subject: [PATCH 31/45] `std::dims` (#2961) --- docs/libcudacxx/standard_api.rst | 6 ++-- .../standard_api/container_library/mdspan.rst | 1 + .../include/cuda/std/__mdspan/extents.h | 3 ++ .../mdspan.extents.dims/compare.pass.cpp | 29 +++++++++++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp diff --git a/docs/libcudacxx/standard_api.rst b/docs/libcudacxx/standard_api.rst index 0729df55406..cb01d478702 100644 --- a/docs/libcudacxx/standard_api.rst +++ b/docs/libcudacxx/standard_api.rst @@ -101,5 +101,7 @@ Feature availability: - C++23 ```` is available in C++17. - - mdspan is feature complete in C++17 onwards. - - mdspan on msvc is only supported in C++20 and onwards. + - ``mdspan`` is feature complete in C++17 onwards. + - ``mdspan`` on msvc is only supported in C++20 and onwards. + +- C++26 ``std::dims`` is available in C++17. 
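A minimal usage sketch of the new alias (not part of the patch; it assumes C++17, that `<cuda/std/mdspan>`, `<cuda/std/cstddef>` and `<cuda/std/type_traits>` are the right headers, and that the index type defaults to `size_t` as in the C++26 `std::dims`):

  #include <cuda/std/cstddef>
  #include <cuda/std/mdspan>
  #include <cuda/std/type_traits>

  // dims<Rank> names a rank-Rank extents type with every extent dynamic,
  // i.e. the same type as dextents<size_t, Rank>.
  using ext2d_t = cuda::std::dims<2>;
  static_assert(cuda::std::is_same<ext2d_t, cuda::std::dextents<cuda::std::size_t, 2>>::value, "");

  // A 3 x 4 dynamic-extent object, usable wherever dextents is used today.
  ext2d_t extents{3, 4};

The conformance test added below (compare.pass.cpp) is the authoritative example.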
diff --git a/docs/libcudacxx/standard_api/container_library/mdspan.rst b/docs/libcudacxx/standard_api/container_library/mdspan.rst
index 664a60eb48e..72174d13624 100644
--- a/docs/libcudacxx/standard_api/container_library/mdspan.rst
+++ b/docs/libcudacxx/standard_api/container_library/mdspan.rst
@@ -7,6 +7,7 @@ Extensions
 ----------
 
 - All features of ``<mdspan>`` are made available in C++17 onwards
+- C++26 ``std::dims`` is made available in C++17 onwards
 
 Restrictions
 ------------
diff --git a/libcudacxx/include/cuda/std/__mdspan/extents.h b/libcudacxx/include/cuda/std/__mdspan/extents.h
index d0bdfd016f6..0acb6579d7d 100644
--- a/libcudacxx/include/cuda/std/__mdspan/extents.h
+++ b/libcudacxx/include/cuda/std/__mdspan/extents.h
@@ -523,6 +523,9 @@ struct __make_dextents<_IndexType, 0, _CUDA_VSTD::extents<_IndexType, _ExtentsPa
 template <class _IndexType, size_t _Rank>
 using dextents = typename __detail::__make_dextents<_IndexType, _Rank>::type;
 
+template <size_t _Rank, class _IndexType = size_t>
+using dims = dextents<_IndexType, _Rank>;
+
 # if defined(__MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION)
 template <class... _IndexTypes>
 _CCCL_HOST_DEVICE extents(_IndexTypes...)
diff --git a/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp
new file mode 100644
index 00000000000..ec2e8c6d725
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/containers/views/mdspan/mdspan.extents.dims/compare.pass.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++11 +// UNSUPPORTED: msvc && c++14, msvc && c++17 + +#include +#include + +int main(int, char**) +{ + { + using index_t = size_t; + + cuda::std::dextents e0{1, 2, 3}; + cuda::std::dims<3> e1{1, 2, 3}; + + static_assert(cuda::std::is_same::value, ""); + assert(e0 == e1); + } + + return 0; +} From 8d6986d46ca5288d4bd7af7b9088f8a55297ba93 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 26 Nov 2024 11:29:19 +0100 Subject: [PATCH 32/45] Fix merge conflict from moving resources up a namespace (#2965) --- cudax/test/algorithm/common.cuh | 2 +- cudax/test/algorithm/copy.cu | 2 +- cudax/test/algorithm/fill.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh index c4c7be0d02c..661d087f3bc 100644 --- a/cudax/test/algorithm/common.cuh +++ b/cudax/test/algorithm/common.cuh @@ -46,7 +46,7 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p template auto make_buffer_for_mdspan(Extents extents, char value = 0) { - cuda::mr::pinned_memory_resource host_resource; + cudax::pinned_memory_resource host_resource; auto mapping = typename Layout::template mapping{extents}; cudax::uninitialized_buffer buffer(host_resource, mapping.required_span_size()); diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu index afb9a2b71d5..583c3a836aa 100644 --- a/cudax/test/algorithm/copy.cu +++ b/cudax/test/algorithm/copy.cu @@ -160,7 +160,7 @@ TEST_CASE("Mdspan copy", "[data_manipulation]") auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1); cuda::std::mdspan mdspan(mdspan_buffer.data(), mixed_extents); cudax::weird_buffer> buffer{ - cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()}; + cudax::pinned_memory_resource{}, mdspan.mapping().required_span_size()}; cudax::copy_bytes(stream, mdspan, buffer); stream.wait(); diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu index 35fae342ad3..80bf6ef57e6 100644 --- a/cudax/test/algorithm/fill.cu +++ b/cudax/test/algorithm/fill.cu @@ -67,7 +67,7 @@ TEST_CASE("Mdspan Fill", "[data_manipulation]") { using static_extents = cuda::std::extents; auto size = cuda::std::layout_left::mapping().required_span_size(); - cudax::weird_buffer> buffer(cuda::mr::pinned_memory_resource{}, size); + cudax::weird_buffer> buffer(cudax::pinned_memory_resource{}, size); cudax::fill_bytes(stream, buffer, fill_byte); check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size)); From 3e826380f2c42162d529dc67944ad8e8435d4d18 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Tue, 26 Nov 2024 15:45:03 -0800 Subject: [PATCH 33/45] [CUDAX] Add a way to combine thread hierarchies (#2746) * Implement hierarchy_dimensions::combine * Fix issues after the merge --- .../__hierarchy/hierarchy_dimensions.cuh | 66 ++++++++++++++++++- .../__hierarchy/hierarchy_levels.cuh | 11 ++-- cudax/test/hierarchy/hierarchy_smoke.cu | 51 ++++++++++++++ 3 files changed, 120 insertions(+), 8 deletions(-) diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh index a458c0d4017..61ddc5cb203 100644 --- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_dimensions.cuh @@ -138,7 +138,7 @@ template struct can_stack_checker { template - using 
can_stack = ::cuda::std::__fold_and...>; + using can_stack = ::cuda::std::__fold_and...>; }; template @@ -436,6 +436,9 @@ private: }; public: + template + friend struct hierarchy_dimensions_fragment; + template using extents_type = decltype(::cuda::std::apply( ::cuda::std::declval>(), @@ -715,6 +718,63 @@ public: return ::cuda::std::apply(detail::get_level_helper{}, levels); } + + //! @brief Returns a new hierarchy with combined levels of this and the other supplied hierarchy + //! + //! This function combines this hierarchy with the supplied hierarchy, the resulting hierarchy + //! holds levels present in both hierarchies. In case of overlap of levels this hierarchy + //! is prioritized, so the result always holds all levels from this hierarchy and non-overlapping + //! levels from the other hierarchy. + //! + //! @param other The other hierarchy to be combined with this hierarchy + //! + //! @return Hierarchy holding the combined levels from both hierarchies + template + constexpr auto combine(const hierarchy_dimensions_fragment& other) + { + using this_top_level = __level_type_of<::cuda::std::__type_index_c<0, Levels...>>; + using this_bottom_level = __level_type_of<::cuda::std::__type_index_c>; + using other_top_level = __level_type_of<::cuda::std::__type_index_c<0, OtherLevels...>>; + using other_bottom_level = __level_type_of<::cuda::std::__type_index_c>; + if constexpr (detail::can_rhs_stack_on_lhs) + { + // Easily stackable case, example this is (grid), other is (cluster, block) + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(levels, other.levels)); + } + else if constexpr (has_level> + && (!has_level> + || ::cuda::std::is_same_v) ) + { + // Overlap with this on the top, e.g. this is (grid, cluster), other is (cluster, block), can fully overlap + // Do we have some CCCL tuple utils that can select all but the first? + auto to_add_with_one_too_many = other.template levels_range(); + auto to_add = ::cuda::std::apply( + [](auto&&, auto&&... rest) { + return ::cuda::std::make_tuple(rest...); + }, + to_add_with_one_too_many); + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(levels, to_add)); + } + else + { + if constexpr (detail::can_rhs_stack_on_lhs) + { + // Easily stackable case again, just reversed + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(other.levels, levels)); + } + else + { + // Overlap with this on the bottom, e.g. 
this is (cluster, block), other is (grid, cluster), can fully overlap + static_assert(has_level> + && (!has_level> + || ::cuda::std::is_same_v), + "Can't combine the hierarchies"); + + auto to_add = other.template levels_range(); + return ::cuda::std::apply(fragment_helper(), ::cuda::std::tuple_cat(to_add, levels)); + } + } + } }; /** @@ -810,14 +870,14 @@ _CUDAX_API constexpr auto operator&(const hierarchy_dimensions_fragment>; using bottom_level = __level_type_of<::cuda::std::__type_index_c>; - if constexpr (detail::can_stack_on_top>) + if constexpr (detail::can_rhs_stack_on_lhs>) { return hierarchy_dimensions_fragment( ::cuda::std::tuple_cat(::cuda::std::make_tuple(new_level), ls.levels)); } else { - static_assert(detail::can_stack_on_top<__level_type_of, bottom_level>, + static_assert(detail::can_rhs_stack_on_lhs<__level_type_of, bottom_level>, "Not supported order of levels in hierarchy"); using NewUnit = detail::__default_unit_below<__level_type_of>; return hierarchy_dimensions_fragment( diff --git a/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh b/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh index 23593866c75..bbdcdcfc77e 100644 --- a/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/hierarchy_levels.cuh @@ -68,6 +68,7 @@ struct dimensions_query return hierarchy::extents(); } }; + } // namespace detail // Struct to represent levels allowed below or above a certain level, @@ -91,12 +92,12 @@ _CCCL_INLINE_VAR constexpr bool is_level_allowed...>; template -_CCCL_INLINE_VAR constexpr bool can_stack_on_top = +_CCCL_INLINE_VAR constexpr bool can_rhs_stack_on_lhs = is_level_allowed || is_level_allowed; template _CCCL_INLINE_VAR constexpr bool legal_unit_for_level = - can_stack_on_top || legal_unit_for_level>; + can_rhs_stack_on_lhs || legal_unit_for_level>; template _CCCL_INLINE_VAR constexpr bool legal_unit_for_level = false; @@ -275,7 +276,7 @@ struct dims_helper template /* _CCCL_NODISCARD */ _CCCL_DEVICE auto extents_impl() { - if constexpr (::cuda::std::is_same_v || can_stack_on_top) + if constexpr (::cuda::std::is_same_v || can_rhs_stack_on_lhs) { return dim3_to_dims(dims_helper::extents()); } @@ -291,7 +292,7 @@ template template /* _CCCL_NODISCARD */ _CCCL_DEVICE auto index_impl() { - if constexpr (::cuda::std::is_same_v || detail::can_stack_on_top) + if constexpr (::cuda::std::is_same_v || detail::can_rhs_stack_on_lhs) { return dim3_to_dims(dims_helper::index()); } @@ -386,7 +387,7 @@ template _CCCL_DEVICE auto rank(const Unit&, const Level&) { static_assert(detail::legal_unit_for_level); - if constexpr (detail::can_stack_on_top) + if constexpr (detail::can_rhs_stack_on_lhs) { return detail::index_to_linear( detail::index_impl(), detail::extents_impl()); diff --git a/cudax/test/hierarchy/hierarchy_smoke.cu b/cudax/test/hierarchy/hierarchy_smoke.cu index 582e745ce3c..206c71d45bb 100644 --- a/cudax/test/hierarchy/hierarchy_smoke.cu +++ b/cudax/test/hierarchy/hierarchy_smoke.cu @@ -526,3 +526,54 @@ TEST_CASE("cudax::distribute", "[hierarchy]") CUDAX_REQUIRE(dims.count(cudax::thread, cudax::block) == 256); CUDAX_REQUIRE(dims.count(cudax::block, cudax::grid) == (numElements + threadsPerBlock - 1) / threadsPerBlock); } + +TEST_CASE("hierarchy merge", "[hierarchy]") +{ + SECTION("Non overlapping") + { + auto h1 = cudax::make_hierarchy_fragment(cudax::grid_dims<2>()); + auto h2 = cudax::make_hierarchy_fragment(cudax::block_dims<3>()); + auto combined = h1.combine(h2); + 
static_assert(combined.count(cudax::thread) == 6); + static_assert(combined.count(cudax::thread, cudax::block) == 3); + static_assert(combined.count(cudax::block) == 2); + auto combined_the_other_way = h2.combine(h1); + static_assert(cuda::std::is_same_v); + static_assert(combined_the_other_way.count(cudax::thread) == 6); + + auto dynamic_values = cudax::cluster_dims(4) & cudax::block_dims(5); + auto combined_dynamic = dynamic_values.combine(h1); + CUDAX_REQUIRE(combined_dynamic.count(cudax::thread) == 40); + } + SECTION("Overlapping") + { + auto h1 = cudax::make_hierarchy_fragment(cudax::grid_dims<2>(), cudax::cluster_dims<3>()); + auto h2 = cudax::make_hierarchy_fragment(cudax::block_dims<4>(), cudax::cluster_dims<5>()); + auto combined = h1.combine(h2); + static_assert(combined.count(cudax::thread) == 24); + static_assert(combined.count(cudax::thread, cudax::block) == 4); + static_assert(combined.count(cudax::block) == 6); + + auto combined_the_other_way = h2.combine(h1); + static_assert(!cuda::std::is_same_v); + static_assert(combined_the_other_way.count(cudax::thread) == 40); + static_assert(combined_the_other_way.count(cudax::thread, cudax::block) == 4); + static_assert(combined_the_other_way.count(cudax::block) == 10); + + auto ultimate_combination = combined.combine(combined_the_other_way); + static_assert(cuda::std::is_same_v); + static_assert(ultimate_combination.count(cudax::thread) == 24); + + auto block_level_replacement = cudax::make_hierarchy_fragment(cudax::block_dims<6>()); + auto with_block_replaced = block_level_replacement.combine(combined); + static_assert(with_block_replaced.count(cudax::thread) == 36); + static_assert(with_block_replaced.count(cudax::thread, cudax::block) == 6); + + auto grid_cluster_level_replacement = + cudax::make_hierarchy_fragment(cudax::grid_dims<7>(), cudax::cluster_dims<8>()); + auto with_grid_cluster_replaced = grid_cluster_level_replacement.combine(combined); + static_assert(with_grid_cluster_replaced.count(cudax::thread) == 7 * 8 * 4); + static_assert(with_grid_cluster_replaced.count(cudax::block, cudax::cluster) == 8); + static_assert(with_grid_cluster_replaced.count(cudax::cluster) == 7); + } +} From ab87e540f47abb5f85adc2edad12d00afbfc34d9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 26 Nov 2024 18:11:57 -0600 Subject: [PATCH 34/45] Require approval to run CI on draft PRs. 
(#2969) --- .github/copy-pr-bot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index d799c24aa69..f1297e5fb15 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -2,6 +2,7 @@ # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ enabled: true +auto_sync_draft: false additional_trustees: - ahendriksen - gonzalobg From 27d8c87eb887feb61b6aadd4557f0444a2681562 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:42:47 -0800 Subject: [PATCH 35/45] fix thread-reduce performance regression (#2944) --- cub/cub/thread/thread_operators.cuh | 17 ++++ cub/cub/thread/thread_reduce.cuh | 147 ++++++++++++++++++++++------ 2 files changed, 134 insertions(+), 30 deletions(-) diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index 05f2d6a41f6..2ba2f6e0c1b 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -702,6 +702,10 @@ struct CubOperatorToSimdOperator<::cuda::minimum<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::minimum, T> : CubOperatorToSimdOperator<::cuda::minimum<>, T> +{}; + template struct CubOperatorToSimdOperator<::cuda::maximum<>, T> { @@ -709,6 +713,10 @@ struct CubOperatorToSimdOperator<::cuda::maximum<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::maximum, T> : CubOperatorToSimdOperator<::cuda::maximum<>, T> +{}; + template struct CubOperatorToSimdOperator<::cuda::std::plus<>, T> { @@ -716,6 +724,10 @@ struct CubOperatorToSimdOperator<::cuda::std::plus<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::std::plus, T> : CubOperatorToSimdOperator<::cuda::std::plus<>, T> +{}; + template struct CubOperatorToSimdOperator<::cuda::std::multiplies<>, T> { @@ -723,6 +735,11 @@ struct CubOperatorToSimdOperator<::cuda::std::multiplies<>, T> using simd_type = typename type::simd_type; }; +template +struct CubOperatorToSimdOperator<::cuda::std::multiplies, T> + : CubOperatorToSimdOperator<::cuda::std::multiplies<>, T> +{}; + template using cub_operator_to_simd_operator_t = typename CubOperatorToSimdOperator::type; diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index f384d907b34..ad8342d65a9 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -229,8 +229,10 @@ namespace internal template struct enable_generic_simd_reduction_traits { - static constexpr bool value = cub::detail::is_one_of() - && cub::detail::is_one_of, ::cuda::maximum<>>(); + static constexpr bool value = + cub::detail::is_one_of() + && cub::detail:: + is_one_of, ::cuda::minimum, ::cuda::maximum<>, ::cuda::maximum>(); }; # if defined(_CCCL_HAS_NVFP16) @@ -238,8 +240,16 @@ struct enable_generic_simd_reduction_traits template struct enable_generic_simd_reduction_traits<__half, ReductionOp> { - static constexpr bool value = cub::detail:: - is_one_of, ::cuda::maximum<>, ::cuda::std::plus<>, ::cuda::std::multiplies<>>(); + static constexpr bool value = cub::detail::is_one_of< + ReductionOp, + ::cuda::minimum<>, + ::cuda::minimum<__half>, + ::cuda::maximum<>, + ::cuda::maximum<__half>, + ::cuda::std::plus<>, + ::cuda::std::plus<__half>, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies<__half>>(); }; # endif // defined(_CCCL_HAS_NVFP16) @@ -248,8 +258,16 @@ struct 
enable_generic_simd_reduction_traits<__half, ReductionOp> template struct enable_generic_simd_reduction_traits<__nv_bfloat16, ReductionOp> { - static constexpr bool value = cub::detail:: - is_one_of, ::cuda::maximum<>, ::cuda::std::plus<>, ::cuda::std::multiplies<>>(); + static constexpr bool value = cub::detail::is_one_of< + ReductionOp, + ::cuda::minimum<>, + ::cuda::minimum<__nv_bfloat16>, + ::cuda::maximum<>, + ::cuda::maximum<__nv_bfloat16>, + ::cuda::std::plus<>, + ::cuda::std::plus<__nv_bfloat16>, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies<__nv_bfloat16>>(); }; # endif // defined(_CCCL_HAS_NVBF16) @@ -269,7 +287,8 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_sm90_simd_reduction() using cub::detail::is_one_of; // ::cuda::std::plus<> not handled: IADD3 always produces less instructions than VIADD2 return is_one_of() && // - is_one_of, ::cuda::maximum<>>() && Length >= 10; + is_one_of, ::cuda::minimum, ::cuda::maximum<>, ::cuda::maximum>() + && Length >= 10; } template @@ -277,7 +296,15 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_sm80_simd_reduction() { using cub::detail::is_one_of; using ::cuda::std::is_same; - return is_one_of, ::cuda::maximum<>, ::cuda::std::plus<>, ::cuda::std::multiplies<>>() + return is_one_of, + ::cuda::minimum, + ::cuda::maximum<>, + ::cuda::maximum, + ::cuda::std::plus<>, + ::cuda::std::plus, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies>() && Length >= 4 # if defined(_CCCL_HAS_NVFP16) && defined(_CCCL_HAS_NVBF16) && (is_same::value || is_same::value) @@ -295,7 +322,12 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_sm70_simd_reduction() using cub::detail::is_one_of; using ::cuda::std::is_same; # if defined(_CCCL_HAS_NVFP16) - return is_same::value && is_one_of, ::cuda::std::multiplies<>>() + return is_same::value + && is_one_of, + ::cuda::std::plus, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies>() && Length >= 4; # else return false; @@ -344,14 +376,21 @@ template struct enable_ternary_reduction_sm90 { static constexpr bool value = - cub::detail::is_one_of - && cub::detail::is_one_of, - ::cuda::maximum<>, - ::cuda::std::plus<>, - ::cuda::std::bit_and<>, - ::cuda::std::bit_or<>, - ::cuda::std::bit_xor<>>(); + cub::detail::is_one_of() + && cub::detail::is_one_of< + ReductionOp, + ::cuda::minimum<>, + ::cuda::minimum, + ::cuda::maximum<>, + ::cuda::maximum, + ::cuda::std::plus<>, + ::cuda::std::plus, + ::cuda::std::bit_and<>, + ::cuda::std::bit_and, + ::cuda::std::bit_or<>, + ::cuda::std::bit_or, + ::cuda::std::bit_xor<>, + ::cuda::std::bit_xor>(); }; # if defined(_CCCL_HAS_NVFP16) @@ -360,7 +399,13 @@ template struct enable_ternary_reduction_sm90<__half2, ReductionOp> { static constexpr bool value = - cub::detail::is_one_of, ::cuda::maximum<>, SimdMin<__half>, SimdMax<__half>>(); + cub::detail::is_one_of, + ::cuda::minimum<__half2>, + ::cuda::maximum<>, + ::cuda::maximum<__half2>, + SimdMin<__half>, + SimdMax<__half>>(); }; # endif // defined(_CCCL_HAS_NVFP16) @@ -370,8 +415,14 @@ struct enable_ternary_reduction_sm90<__half2, ReductionOp> template struct enable_ternary_reduction_sm90<__nv_bfloat162, ReductionOp> { - static constexpr bool value = cub::detail:: - is_one_of, ::cuda::maximum<>, SimdMin<__nv_bfloat16>, SimdMax<__nv_bfloat16>>(); + static constexpr bool value = + cub::detail::is_one_of, + ::cuda::minimum<__nv_bfloat162>, + ::cuda::maximum<>, + ::cuda::maximum<__nv_bfloat162>, + SimdMin<__nv_bfloat16>, + SimdMax<__nv_bfloat16>>(); }; # endif // defined(_CCCL_HAS_NVBF16) @@ -394,10 +445,11 @@ 
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE _CCCL_CONSTEXPR_CXX14 bool enable NV_PROVIDES_SM_90, (return enable_ternary_reduction_sm90::value;), NV_PROVIDES_SM_50, - (return is_one_of() - && is_one_of, ::cuda::std::bit_and<>, ::cuda::std::bit_or<>, - ::cuda::std::bit_xor<>>();), + (return is_one_of() + && is_one_of, ::cuda::std::plus, + ::cuda::std::bit_and<>, ::cuda::std::bit_and, + ::cuda::std::bit_or<>, ::cuda::std::bit_or, + ::cuda::std::bit_xor<>, ::cuda::std::bit_xor>();), NV_ANY_TARGET, (return false;) ); @@ -415,12 +467,19 @@ _CCCL_NODISCARD _CCCL_DEVICE constexpr bool enable_promotion() return ::cuda::std::is_integral::value && sizeof(T) <= 2 && is_one_of, + ::cuda::std::plus, ::cuda::std::multiplies<>, + ::cuda::std::multiplies, ::cuda::std::bit_and<>, + ::cuda::std::bit_and, ::cuda::std::bit_or<>, + ::cuda::std::bit_or, ::cuda::std::bit_xor<>, + ::cuda::std::bit_xor, ::cuda::maximum<>, - ::cuda::minimum<>>(); + ::cuda::maximum, + ::cuda::minimum<>, + ::cuda::minimum>(); } /*********************************************************************************************************************** @@ -551,18 +610,46 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const Input& using cub::internal::enable_simd_reduction; using cub::internal::enable_ternary_reduction; using PromT = ::cuda::std::_If(), int, AccumT>; + _CCCL_IF_CONSTEXPR (!cub::detail::is_one_of< + ReductionOp, + ::cuda::std::plus<>, + ::cuda::std::plus, + ::cuda::std::multiplies<>, + ::cuda::std::multiplies, + ::cuda::std::bit_and<>, + ::cuda::std::bit_and, + ::cuda::std::bit_or<>, + ::cuda::std::bit_or, + ::cuda::std::bit_xor<>, + ::cuda::std::bit_xor, + ::cuda::maximum<>, + ::cuda::maximum, + ::cuda::minimum<>, + ::cuda::minimum, + cub::internal::SimdMin, + cub::internal::SimdMax>()) + { + return cub::internal::ThreadReduceSequential(input, reduction_op); + } + _CCCL_IF_CONSTEXPR (cub::detail::is_one_of, ::cuda::std::plus>() + && cub::detail::is_one_of()) + { + // clang-format off + NV_IF_TARGET(NV_PROVIDES_SM_90, + (return cub::internal::ThreadReduceSequential(input, reduction_op);), + (return cub::internal::ThreadReduceTernaryTree(input, reduction_op);) + ); + // clang-format on + } if (enable_simd_reduction()) { return cub::internal::ThreadReduceSimd(input, reduction_op); } - else if (enable_ternary_reduction()) + if (enable_ternary_reduction()) { return cub::internal::ThreadReduceTernaryTree(input, reduction_op); } - else - { - return cub::internal::ThreadReduceBinaryTree(input, reduction_op); - } + return cub::internal::ThreadReduceBinaryTree(input, reduction_op); } //! @brief Reduction over statically-sized array-like types, seeded with the specified @p prefix. 
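Context for the dispatch rework above, with a sketch that is not part of the patch: the hunks consistently add the typed functor spellings (e.g. `::cuda::maximum<T>`) next to the transparent ones (`::cuda::maximum<>`), presumably because the two are distinct types and previously only the transparent spelling was matched onto the fast SIMD / ternary-tree paths. The snippet below only illustrates that type distinction using the internal `cub::detail::is_one_of` helper seen in the hunks; the umbrella include is an assumption.

  #include <cub/cub.cuh>       // assumption: umbrella header pulls in the internal helper
  #include <cuda/functional>   // cuda::maximum

  // maximum<> and maximum<int> are different types, so a dispatch list must name
  // both for either spelling to take the optimized code path.
  static_assert(!cub::detail::is_one_of<cuda::maximum<int>, cuda::maximum<>>(), "");
  static_assert(cub::detail::is_one_of<cuda::maximum<int>, cuda::maximum<>, cuda::maximum<int>>(), "");
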
From 83aca35dde9ecc81044398a6d21a89d88a3f708f Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Tue, 26 Nov 2024 18:22:01 -0800 Subject: [PATCH 36/45] add a `__type_switch` utility and use it the ptx generator (#2946) --- docs/repo.toml | 5 +- .../functions/cuda_ptx_generated_helper.h | 92 +++++++------- libcudacxx/include/cuda/std/__cccl/dialect.h | 9 ++ .../cuda/std/__type_traits/type_list.h | 112 +++++++++++++++--- .../test/libcudacxx/cuda/type_list.pass.cpp | 20 ++++ 5 files changed, 168 insertions(+), 70 deletions(-) diff --git a/docs/repo.toml b/docs/repo.toml index f7c426f13db..ace31c74c71 100644 --- a/docs/repo.toml +++ b/docs/repo.toml @@ -149,6 +149,7 @@ doxygen_predefined = [ "_CCCL_FORCEINLINE", "_CCCL_STD_VER", "_CCCL_NODISCARD", + "_CCCL_NTTP_AUTO=auto", "_CCCL_VISIBILITY_HIDDEN", "_CCCL_SUPPRESS_DEPRECATED_PUSH", "_CCCL_SUPPRESS_DEPRECATED_POP", @@ -261,6 +262,7 @@ doxygen_predefined = [ "_CCCL_HOST=", "_CCCL_HOST_DEVICE=", "_CCCL_NODISCARD=[[nodiscard]]", + "_CCCL_NTTP_AUTO=auto", "_CCCL_STD_VER", "_CCCL_SUPPRESS_DEPRECATED_PUSH", "_CCCL_SUPPRESS_DEPRECATED_POP", @@ -408,6 +410,7 @@ doxygen_predefined = [ "_CCCL_CUDACC_AT_LEAST(x, y)=1", "_CCCL_CUDACC_BELOW(x, y)=0", "_CCCL_DEVICE=", + "_CCCL_DOXYGEN_INVOKED", "_CCCL_EAT_REST(x)=", "_CCCL_EXEC_CHECK_DISABLE=", "_CCCL_FORCEINLINE=", @@ -419,6 +422,7 @@ doxygen_predefined = [ "_CCCL_INLINE_VAR=inline", "_CCCL_NODISCARD=[[nodiscard]]", "_CCCL_NODISCARD_FRIEND=", + "_CCCL_NTTP_AUTO=auto", "_CCCL_STD_VER=2020", "_CCCL_TRAIT(x, y)=x::value", "_CUDA_VMR=cuda::mr", @@ -443,7 +447,6 @@ doxygen_predefined = [ "_CUDAX_TRIVIAL_DEVICE_API", "_CUDAX_PUBLIC_API", "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE=", - "_CCCL_DOXYGEN_INVOKED", ] # make sure to use ./fetch_imgs.sh diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h index 2e2266ce979..cdb35957509 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -110,61 +111,50 @@ struct __atomic_longlong2 template using __atomic_cuda_deduce_bitwise = - _If, - _If, - _If, - _If, - __atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>>>; + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_case<8, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction<__atomic_longlong2, __atomic_cuda_operand_b128>>>; template -using __atomic_cuda_deduce_arithmetic = - _If<_CCCL_TRAIT(is_floating_point, _Type), - _If, - __atomic_cuda_operand_deduction>, - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>, // There is no - // atom.add.s64 - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>>>; +using __atomic_cuda_deduce_arithmetic = _If< + _CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>, // There is no + // atom.add.s64 + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>>>; 
template -using __atomic_cuda_deduce_minmax = - _If<_CCCL_TRAIT(is_floating_point, _Type), - _If, - __atomic_cuda_operand_deduction>, - _If<_CCCL_TRAIT(is_signed, _Type), - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>, // atom.min|max.s64 - // supported - _If, - _If, - _If, - __atomic_cuda_operand_deduction>>>>>; +using __atomic_cuda_deduce_minmax = _If< + _CCCL_TRAIT(is_floating_point, _Type), + _If, + __atomic_cuda_operand_deduction>, + _If<_CCCL_TRAIT(is_signed, _Type), + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>, // atom.min|max.s64 + // supported + __type_switch>, + __type_case<2, __atomic_cuda_operand_deduction>, + __type_case<4, __atomic_cuda_operand_deduction>, + __type_default<__atomic_cuda_operand_deduction>>>>; template using __atomic_enable_if_native_bitwise = bool; diff --git a/libcudacxx/include/cuda/std/__cccl/dialect.h b/libcudacxx/include/cuda/std/__cccl/dialect.h index 407f2db6ecf..06387172b9b 100644 --- a/libcudacxx/include/cuda/std/__cccl/dialect.h +++ b/libcudacxx/include/cuda/std/__cccl/dialect.h @@ -105,6 +105,15 @@ # define _CCCL_NO_VARIABLE_TEMPLATES #endif // _CCCL_STD_VER <= 2011 +// Declaring a non-type template parameters with auto is only available from C++17 onwards +#if _CCCL_STD_VER >= 2017 && defined(__cpp_nontype_template_parameter_auto) \ + && (__cpp_nontype_template_parameter_auto >= 201606L) +# define _CCCL_NTTP_AUTO auto +#else // ^^^ C++17 ^^^ / vvv C++14 vvv +# define _CCCL_NO_NONTYPE_TEMPLATE_PARAMETER_AUTO +# define _CCCL_NTTP_AUTO unsigned long long int +#endif // _CCCL_STD_VER <= 2014 + // concepts are only available from C++20 onwards #if _CCCL_STD_VER <= 2017 || !defined(__cpp_concepts) || (__cpp_concepts < 201907L) # define _CCCL_NO_CONCEPTS diff --git a/libcudacxx/include/cuda/std/__type_traits/type_list.h b/libcudacxx/include/cuda/std/__type_traits/type_list.h index 00f69273673..1beb22b1807 100644 --- a/libcudacxx/include/cuda/std/__type_traits/type_list.h +++ b/libcudacxx/include/cuda/std/__type_traits/type_list.h @@ -557,6 +557,100 @@ using __type_front = __type_at_c<0, _List>; template using __type_back = __type_at_c<_List::__size - 1, _List>; +//! \brief A pair of types +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_pair +{ + using __first _CCCL_NODEBUG_ALIAS = _First; + using __second _CCCL_NODEBUG_ALIAS = _Second; +}; + +//! \brief Retrieve the first of a pair of types +//! \pre \c _Pair is a specialization of \c __type_pair +template +using __type_pair_first _CCCL_NODEBUG_ALIAS = typename _Pair::__first; + +//! \brief Retrieve the second of a pair of types +//! \pre \c _Pair is a specialization of \c __type_pair +template +using __type_pair_second _CCCL_NODEBUG_ALIAS = typename _Pair::__second; + +//! \see __type_switch +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_default +{ + template + using __rebind _CCCL_NODEBUG_ALIAS = __type_default; + + using type _CCCL_NODEBUG_ALIAS = _Value; +}; + +# if _CCCL_CUDACC_AT_LEAST(12, 0) || defined(_CCCL_DOXYGEN_INVOKED) + +//! 
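The doxygen example above shows the label/case/default shape; the atomics helper hunk earlier in this patch keys the switch on `sizeof`. A compressed sketch of that pattern follows (not part of the patch; it assumes C++17, the alias name and case payload types are made up for illustration, and including the detail header directly is an assumption):

  #include <cuda/std/__type_traits/type_list.h> // assumption: detail header defining __type_switch
  #include <cuda/std/type_traits>

  // Select a storage type by byte width, falling back to the type itself when no
  // case label matches.
  template <class _Type>
  using __storage_t = cuda::std::__type_switch<sizeof(_Type),
                                               cuda::std::__type_case<4, int>,
                                               cuda::std::__type_case<8, long long>,
                                               cuda::std::__type_default<_Type>>;

  static_assert(cuda::std::is_same<__storage_t<float>, int>::value, "");
  static_assert(cuda::std::is_same<__storage_t<double>, long long>::value, "");
  static_assert(cuda::std::is_same<__storage_t<char>, char>::value, ""); // default case
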
\see __type_switch +template <_CCCL_NTTP_AUTO _Label, class _Value> +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_case +{ + template + using __rebind _CCCL_NODEBUG_ALIAS = __type_case(_Label), _Value>; + + using type = _Value; +}; + +# else // ^^^ CUDACC >= 12.0 || DOXYGEN ^^^ / vvv CUDACC < 12.0 && !DOXYGEN vvv + +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_case_ +{ + template + using __rebind _CCCL_NODEBUG_ALIAS = __type_case_, _Value>; + + using type = _Value; +}; + +template <_CCCL_NTTP_AUTO _Label, class _Value> +using __type_case _CCCL_NODEBUG_ALIAS = __type_case_, _Value>; + +# endif // CUDACC < 12.0 && !DOXYGEN + +namespace __detail +{ +template <_CCCL_NTTP_AUTO _Label, class _Value> +_LIBCUDACXX_HIDE_FROM_ABI auto __type_switch_fn(__type_case<_Label, _Value>*, int) -> __type_case<_Label, _Value>; + +template <_CCCL_NTTP_AUTO _Label, class _Value> +_LIBCUDACXX_HIDE_FROM_ABI auto __type_switch_fn(__type_default<_Value>*, long) -> __type_default<_Value>; +} // namespace __detail + +//! \see __type_switch +template +struct _CCCL_TYPE_VISIBILITY_DEFAULT _LIBCUDACXX_DECLSPEC_EMPTY_BASES __type_switch_fn + : _Cases::template __rebind<_Type>... +{ + template + using __call _CCCL_NODEBUG_ALIAS = + __type(static_cast<__type_switch_fn*>(nullptr), 0))>; +}; + +//! \brief Given an integral constant \c _Label and a pack of "cases" +//! consisting of one or more specializations of \c __type_case and zero or +//! one specializations of \c __type_default, `__type_switch<_Label, _Cases...>` +//! returns the value associated with the first case whose label matches the +//! given label. If no such case exists, the value associated with the default +//! case is returned. If no default case exists, the type is ill-formed. +//! +//! \p Example: +//! \code +//! using result = __type_switch<2, +//! __type_case<1, char>, +//! __type_case<2, double>, +//! __type_default>; +//! static_assert(is_same_v); +//! \endcode +template <_CCCL_NTTP_AUTO _Label, class... _Cases> +using __type_switch _CCCL_NODEBUG_ALIAS = + __type_call<__type_switch_fn, integral_constant>; + namespace __detail { # if _CCCL_COMPILER(MSVC, <, 19, 38) @@ -907,24 +1001,6 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_sizeof using __call _CCCL_NODEBUG_ALIAS = integral_constant; }; -//! \brief A pair of types -template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_pair -{ - using __first _CCCL_NODEBUG_ALIAS = _First; - using __second _CCCL_NODEBUG_ALIAS = _Second; -}; - -//! \brief Retreive the first of a pair of types -//! \pre \c _Pair is a specialization of \c __type_pair -template -using __type_pair_first = typename _Pair::__first; - -//! \brief Retreive the second of a pair of types -//! \pre \c _Pair is a specialization of \c __type_pair -template -using __type_pair_second = typename _Pair::__second; - //! \brief A list of compile-time values, and a meta-callable that accepts a //! meta-callable and evaluates it with the values, each value wrapped in an //! integral constant wrapper. 
diff --git a/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp index 2e477d3622a..9978d7e2c35 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_list.pass.cpp @@ -494,6 +494,26 @@ static_assert( ""); #endif +// __type_switch +static_assert(::cuda::std::is_same<::cuda::std::__type_switch<0, + ::cuda::std::__type_case<0, char>, + ::cuda::std::__type_case<1, double>, + ::cuda::std::__type_default>, + char>::value, + ""); +static_assert(::cuda::std::is_same<::cuda::std::__type_switch<1, + ::cuda::std::__type_case<0, char>, + ::cuda::std::__type_case<1, double>, + ::cuda::std::__type_default>, + double>::value, + ""); +static_assert(::cuda::std::is_same<::cuda::std::__type_switch<2, + ::cuda::std::__type_case<0, char>, + ::cuda::std::__type_case<1, double>, + ::cuda::std::__type_default>, + float>::value, + ""); + // __type_concat static_assert(::cuda::std::is_same<::cuda::std::__type_concat<>, ::cuda::std::__type_list<>>::value, ""); From 0f8687420e80bdd7449e0b173babf7692fcb9a18 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 27 Nov 2024 14:22:52 -0800 Subject: [PATCH 37/45] replace use of old `_CONCEPT_FRAGMENT` macro in cudax (#2973) * replace use of old `_CONCEPT_FRAGMENT` macro in cudax * fix docs build --- .../cuda/experimental/__stream/get_stream.cuh | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cudax/include/cuda/experimental/__stream/get_stream.cuh b/cudax/include/cuda/experimental/__stream/get_stream.cuh index 9edf1d251df..6703cf67ead 100644 --- a/cudax/include/cuda/experimental/__stream/get_stream.cuh +++ b/cudax/include/cuda/experimental/__stream/get_stream.cuh @@ -33,24 +33,21 @@ namespace cuda::experimental { - -template -_CCCL_CONCEPT __convertible_to_stream_ref = _CUDA_VSTD::convertible_to<_Tp, ::cuda::stream_ref>; - -template -_CCCL_CONCEPT_FRAGMENT( - __has_member_get_stream_, - requires(const _Tp& __t)(requires(!__convertible_to_stream_ref<_Tp>), - requires(_CUDA_VSTD::same_as))); - +// clang-format off template -_CCCL_CONCEPT __has_member_get_stream = _CCCL_FRAGMENT(__has_member_get_stream_, _Tp); +_CCCL_CONCEPT __has_member_get_stream = + _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __t) + ( + requires(!_CUDA_VSTD::convertible_to<_Tp, ::cuda::stream_ref>), + _Same_as(::cuda::stream_ref) __t.get_stream() + ); +// clang-format on //! 
@brief `get_stream` is a customization point object that queries a type `T` for an associated stream struct get_stream_t { _CCCL_TEMPLATE(class _Tp) - _CCCL_REQUIRES(__convertible_to_stream_ref<_Tp>) + _CCCL_REQUIRES((_CUDA_VSTD::convertible_to<_Tp, ::cuda::stream_ref>) ) _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI constexpr ::cuda::stream_ref operator()(const _Tp& __t) const noexcept(noexcept(static_cast<::cuda::stream_ref>(__t))) { From a5d33e070708f65234b353baed0df380bad4b367 Mon Sep 17 00:00:00 2001 From: Eric Niebler Date: Wed, 27 Nov 2024 22:46:05 -0800 Subject: [PATCH 38/45] remove vestigal uses of the old `DOXYGEN_SHOULD_SKIP_THIS` macro (#2978) --- .../__memory_resource/managed_memory_resource.cuh | 4 ++-- .../experimental/__memory_resource/pinned_memory_resource.cuh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh index f240155339c..57394558757 100644 --- a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh @@ -159,7 +159,7 @@ public: } #endif // _CCCL_STD_VER <= 2017 -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # if _CCCL_STD_VER >= 2020 //! @brief Equality comparison between a \c managed_memory_resource and another resource //! @param __rhs The resource to compare to @@ -237,7 +237,7 @@ public: friend constexpr void get_property(managed_memory_resource const&, mr::device_accessible) noexcept {} //! @brief Enables the \c host_accessible property friend constexpr void get_property(managed_memory_resource const&, mr::host_accessible) noexcept {} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! @brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept diff --git a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh index 60ec7c9b49e..7b36888b0ef 100644 --- a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh @@ -160,7 +160,7 @@ public: } #endif // _CCCL_STD_VER <= 2017 -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +#ifndef _CCCL_DOXYGEN_INVOKED // Do not document # if _CCCL_STD_VER >= 2020 //! @brief Equality comparison between a \c pinned_memory_resource and another resource //! @param __rhs The resource to compare to @@ -239,7 +239,7 @@ public: friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {} //! @brief Enables the \c host_accessible property friend constexpr void get_property(pinned_memory_resource const&, host_accessible) noexcept {} -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // _CCCL_DOXYGEN_INVOKED //! 
@brief Checks whether the passed in alignment is valid static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept From d68714d45c608d35f7d4a36f2b404a58780fbc82 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 28 Nov 2024 08:24:09 +0100 Subject: [PATCH 39/45] Fix proclaim_copyable_arguments for lambdas (#2833) Co-authored-by: Michael Schellenberger Costa --- cudax/cmake/cudaxBuildCompilerTargets.cmake | 1 + .../cuda/__functional/address_stability.h | 11 +++++- thrust/testing/address_stability.cmake | 12 ++++++ thrust/testing/address_stability.cu | 37 +++++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 thrust/testing/address_stability.cmake diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake index f19ced87e49..84fec426823 100644 --- a/cudax/cmake/cudaxBuildCompilerTargets.cmake +++ b/cudax/cmake/cudaxBuildCompilerTargets.cmake @@ -47,6 +47,7 @@ function(cudax_build_compiler_targets) if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") # stf heavily uses host device lambdas which break on clang due to a warning about the implicitly # deleted copy constructor + # TODO(bgruber): remove this when NVBug 4980157 is resolved append_option_if_available("-Wno-deprecated-copy" cxx_compile_options) endif() diff --git a/libcudacxx/include/cuda/__functional/address_stability.h b/libcudacxx/include/cuda/__functional/address_stability.h index f745b963b42..f2ef9f6d331 100644 --- a/libcudacxx/include/cuda/__functional/address_stability.h +++ b/libcudacxx/include/cuda/__functional/address_stability.h @@ -48,6 +48,13 @@ _CCCL_INLINE_VAR constexpr bool proclaims_copyable_arguments_v = proclaims_copya template struct __callable_permitting_copied_arguments : F { +#if _CCCL_STD_VER <= 2014 + template + _LIBCUDACXX_HIDE_FROM_ABI constexpr __callable_permitting_copied_arguments(G&& g) + : F(::cuda::std::forward(g)) + {} +#endif // _CCCL_STD_VER <= 2014 + using F::operator(); }; @@ -61,9 +68,9 @@ struct proclaims_copyable_arguments<__callable_permitting_copied_arguments> : //! @see proclaims_copyable_arguments template _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr auto -proclaim_copyable_arguments(F f) -> __callable_permitting_copied_arguments +proclaim_copyable_arguments(F&& f) -> __callable_permitting_copied_arguments<::cuda::std::decay_t> { - return __callable_permitting_copied_arguments{_CUDA_VSTD::move(f)}; + return {::cuda::std::forward(f)}; } // Specializations for libcu++ function objects are provided here to not pull this include into `` headers diff --git a/thrust/testing/address_stability.cmake b/thrust/testing/address_stability.cmake new file mode 100644 index 00000000000..e02e34f5870 --- /dev/null +++ b/thrust/testing/address_stability.cmake @@ -0,0 +1,12 @@ +target_compile_options(${test_target} PRIVATE $<$: --extended-lambda>) + +# this check is actually not correct, because we must check the host compiler, not the CXX compiler. 
+# We rely on that those are usually the same ;) +if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) + # When clang >= 13 is used as host compiler, we get the following warning: + # nvcc_internal_extended_lambda_implementation:312:22: error: definition of implicit copy constructor for '__nv_hdl_wrapper_t, int (const int &)>' is deprecated because it has a user-declared copy assignment operator [-Werror,-Wdeprecated-copy] + # 312 | __nv_hdl_wrapper_t & operator=(const __nv_hdl_wrapper_t &in) = delete; + # | ^ + # Let's suppress it until NVBug 4980157 is resolved. + target_compile_options(${test_target} PRIVATE $<$: -Wno-deprecated-copy>) +endif () diff --git a/thrust/testing/address_stability.cu b/thrust/testing/address_stability.cu index 987fc938058..1fed9100097 100644 --- a/thrust/testing/address_stability.cu +++ b/thrust/testing/address_stability.cu @@ -83,3 +83,40 @@ void TestAddressStabilityUserDefinedFunctionObject() static_assert(proclaims_copyable_arguments{}))>::value, ""); } DECLARE_UNITTEST(TestAddressStabilityUserDefinedFunctionObject); + +void TestAddressStabilityLambda() +{ + using ::cuda::proclaim_copyable_arguments; + using ::cuda::proclaims_copyable_arguments; + + { + auto l = [](const int& i) { + return i + 2; + }; + static_assert(!proclaims_copyable_arguments::value, ""); + auto pr_l = proclaim_copyable_arguments(l); + ASSERT_EQUAL(pr_l(3), 5); + static_assert(proclaims_copyable_arguments::value, ""); + } + + { + auto l = [] _CCCL_DEVICE(const int& i) { + return i + 2; + }; + static_assert(!proclaims_copyable_arguments::value, ""); + auto pr_device_l = proclaim_copyable_arguments(l); + (void) &pr_device_l; + static_assert(proclaims_copyable_arguments::value, ""); + } + + { + auto l = [] _CCCL_HOST_DEVICE(const int& i) { + return i + 2; + }; + static_assert(!proclaims_copyable_arguments::value, ""); + auto pr_l = proclaim_copyable_arguments(l); + ASSERT_EQUAL(pr_l(3), 5); + static_assert(proclaims_copyable_arguments::value, ""); + } +} +DECLARE_UNITTEST(TestAddressStabilityLambda); From af0a8bb6239503709d74d7e1120c06870ff9687a Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 28 Nov 2024 11:53:37 +0100 Subject: [PATCH 40/45] Forward declare half types in cuda::ptx (#2981) --- .../__ptx/instructions/cp_reduce_async_bulk.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index f1487301ada..ce7af1ecc20 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -28,16 +28,12 @@ #include // __CUDA_MINIMUM_ARCH__ and friends -#if defined(_LIBCUDACXX_HAS_NVFP16) -# include -#endif // _LIBCUDACXX_HAS_NVFP16 - -#if defined(_LIBCUDACXX_HAS_NVBF16) -_CCCL_DIAG_PUSH -_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function") -# include -_CCCL_DIAG_POP -#endif // _LIBCUDACXX_HAS_NVBF16 +// Forward-declare __half and __nv_bfloat16. The cuda_fp16.h and cuda_bf16.h are +// expensive to include. The APIs use only pointers, so we do not have to define +// the types. If the user wants to use these types, it is their responsibility +// to include the headers. 
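Illustrative aside, not part of the patch: the forward declarations that follow are sufficient because, as the comment above notes, these APIs traffic only in pointers, and pointers to an incomplete type are fully usable in declarations. A minimal standalone C++ sketch of the same idea, with a hypothetical function name (reduce_bulk is not a real CCCL API):

struct __half;                        // incomplete type, exactly as the patch declares it
void reduce_bulk(const __half* src);  // hypothetical pointer-only API; no definition of __half needed

// Only a caller that actually creates or reads __half values must include <cuda_fp16.h> itself:
//   #include <cuda_fp16.h>
//   __half h = __float2half(1.0f);
//   reduce_bulk(&h);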
+struct __half; +struct __nv_bfloat16; _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX From 9beeb267e3012fe4fd9c0378e7f6c11c2573c2a3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 28 Nov 2024 18:35:20 +0100 Subject: [PATCH 41/45] Fix tuning benchmark for `cub::DeviceTransform` (#2970) * Replace CUB_DETAIL_COUNT by _CCCL_PP_COUNT. It was removed at some point, but not replaced everywhere. * Add missing pragma once to header * Fix use of _CUB_HAS_TRANSFORM_UBLKCP before it is defined --- cub/benchmarks/bench/transform/babelstream1.cu | 17 ----------------- cub/benchmarks/bench/transform/babelstream2.cu | 17 ----------------- cub/benchmarks/bench/transform/babelstream3.cu | 17 ----------------- cub/benchmarks/bench/transform/common.h | 15 +++++++++++++++ cub/benchmarks/bench/transform/complex_cmp.cu | 17 ----------------- cub/benchmarks/bench/transform/fib.cu | 17 ----------------- cub/benchmarks/bench/transform/heavy.cu | 17 ----------------- 7 files changed, 15 insertions(+), 102 deletions(-) diff --git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu index c3b9306398d..ba796f4982e 100644 --- a/cub/benchmarks/bench/transform/babelstream1.cu +++ b/cub/benchmarks/bench/transform/babelstream1.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - template static void mul(nvbench::state& state, nvbench::type_list) { diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu index 61d4e905d92..33ffd6ee173 100644 --- a/cub/benchmarks/bench/transform/babelstream2.cu +++ b/cub/benchmarks/bench/transform/babelstream2.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - template static void add(nvbench::state& state, nvbench::type_list) { diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu index a5c969764ae..90ce2e74ac4 100644 --- a/cub/benchmarks/bench/transform/babelstream3.cu +++ b/cub/benchmarks/bench/transform/babelstream3.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 
(ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - template static void nstream(nvbench::state& state, nvbench::type_list) { diff --git a/cub/benchmarks/bench/transform/common.h b/cub/benchmarks/bench/transform/common.h index 68a158c92bb..d8339645429 100644 --- a/cub/benchmarks/bench/transform/common.h +++ b/cub/benchmarks/bench/transform/common.h @@ -1,7 +1,22 @@ // SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3-Clause +#pragma once + +// keep checks at the top so compilation of discarded variants fails really fast #include +#if !TUNE_BASE && TUNE_ALGORITHM == 1 +# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "When tuning, this benchmark does not support being compiled for multiple architectures" +# endif +# if (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif +# ifndef _CUB_HAS_TRANSFORM_UBLKCP +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + #include #include diff --git a/cub/benchmarks/bench/transform/complex_cmp.cu b/cub/benchmarks/bench/transform/complex_cmp.cu index ac9eb4b0f8b..6849820ee5b 100644 --- a/cub/benchmarks/bench/transform/complex_cmp.cu +++ b/cub/benchmarks/bench/transform/complex_cmp.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - // This benchmark tests overlapping memory regions for reading and is compute intensive template diff --git a/cub/benchmarks/bench/transform/fib.cu b/cub/benchmarks/bench/transform/fib.cu index 8a6c4c3dfa8..b7e16031907 100644 --- a/cub/benchmarks/bench/transform/fib.cu +++ b/cub/benchmarks/bench/transform/fib.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - // This benchmark is compute intensive with diverging threads template diff --git a/cub/benchmarks/bench/transform/heavy.cu b/cub/benchmarks/bench/transform/heavy.cu index 7c35b069e24..be17a04fd8c 100644 --- a/cub/benchmarks/bench/transform/heavy.cu 
+++ b/cub/benchmarks/bench/transform/heavy.cu @@ -4,25 +4,8 @@ // %RANGE% TUNE_THREADS tpb 128:1024:128 // %RANGE% TUNE_ALGORITHM alg 0:1:1 -// keep checks at the top so compilation of discarded variants fails really fast -#if !TUNE_BASE -# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 -# error "Cannot compile algorithm 4 (ublkcp) below sm90" -# endif - -# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) -# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" -# endif -#endif - #include "common.h" -#if !TUNE_BASE -# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 -# error "This benchmark does not support being compiled for multiple architectures" -# endif -#endif - // This benchmark uses a LOT of registers and is compute intensive. template From d9a94936d88670dfff12516cb5c2b1c400c6e3b2 Mon Sep 17 00:00:00 2001 From: David Bayer <48736217+davebayer@users.noreply.github.com> Date: Fri, 29 Nov 2024 09:48:44 +0100 Subject: [PATCH 42/45] fix old gcc version check (#2989) --- libcudacxx/include/cuda/std/__cccl/builtin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index 4e0bfae8a9e..2097bad2d74 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -242,7 +242,7 @@ # undef _CCCL_BUILTIN_ISNAN #endif // _CCCL_CUDACC_BELOW(11, 7) -#if (_CCCL_CHECK_BUILTIN(builtin_launder) || (_CCCL_COMPILER(GCC) && _CCCL_GCC_VERSION >= 70000)) +#if _CCCL_CHECK_BUILTIN(builtin_launder) || _CCCL_COMPILER(GCC, >=, 7) # define _CCCL_BUILTIN_LAUNDER(...) __builtin_launder(__VA_ARGS__) #endif // _CCCL_CHECK_BUILTIN(builtin_launder) && gcc >= 7 From 5bb947109065c8189a96deacbd853e11ff253a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hao=20Zhang=28=E5=BC=A0=E6=B5=A9=29?= Date: Sat, 30 Nov 2024 23:17:16 +0800 Subject: [PATCH 43/45] Fix a typo in thrust/binary_search.h (#2980) (#2992) --- thrust/thrust/binary_search.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/thrust/thrust/binary_search.h b/thrust/thrust/binary_search.h index 20e96722ea3..d370fe37f95 100644 --- a/thrust/thrust/binary_search.h +++ b/thrust/thrust/binary_search.h @@ -756,8 +756,8 @@ bool binary_search(ForwardIterator first, ForwardIterator last, const T& value, * thrust::equal_range(thrust::device, input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + * 1) thrust::equal_range(thrust::device, input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + * 2) thrust::equal_range(thrust::device, input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + - * 2) thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end) + * 2) thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end()) + * thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end()) * \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range @@ -821,8 +821,8 @@ _CCCL_HOST_DEVICE thrust::pair equal_range( * thrust::equal_range(input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1) * thrust::equal_range(input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2) * 
thrust::equal_range(input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2) - * thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) - * thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end) + * thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end()) + * thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end()) * \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range @@ -893,8 +893,8 @@ equal_range(ForwardIterator first, ForwardIterator last, const LessThanComparabl * thrust::less()); // returns [input.begin() + 1, input.begin() + 2) thrust::equal_range(thrust::device, * input.begin(), input.end(), 3, thrust::less()); // returns [input.begin() + 2, input.begin() + 2) * thrust::equal_range(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns [input.begin() + - * 4, input.end) thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns - * [input.end(), input.end) \endcode + * 4, input.end()) thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns + * [input.end(), input.end()) \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range * \see \p lower_bound @@ -962,8 +962,8 @@ _CCCL_HOST_DEVICE thrust::pair equal_range( * input.begin() + 1) thrust::equal_range(input.begin(), input.end(), 2, thrust::less()); // returns [input.begin() * + 1, input.begin() + 2) thrust::equal_range(input.begin(), input.end(), 3, thrust::less()); // returns * [input.begin() + 2, input.begin() + 2) thrust::equal_range(input.begin(), input.end(), 8, thrust::less()); // - * returns [input.begin() + 4, input.end) thrust::equal_range(input.begin(), input.end(), 9, thrust::less()); // - * returns [input.end(), input.end) \endcode + * returns [input.begin() + 4, input.end()) thrust::equal_range(input.begin(), input.end(), 9, thrust::less()); // + * returns [input.end(), input.end()) \endcode * * \see https://en.cppreference.com/w/cpp/algorithm/equal_range * \see \p lower_bound From 2a1273921aca66a80850f850143092458b31efb7 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 30 Nov 2024 16:23:09 +0100 Subject: [PATCH 44/45] Enable assertions for CCCL users in CMake Debug builds (#2986) Fixes: #2975 --- lib/cmake/libcudacxx/libcudacxx-config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/cmake/libcudacxx/libcudacxx-config.cmake b/lib/cmake/libcudacxx/libcudacxx-config.cmake index 824a4976b19..3945f726af0 100644 --- a/lib/cmake/libcudacxx/libcudacxx-config.cmake +++ b/lib/cmake/libcudacxx/libcudacxx-config.cmake @@ -39,6 +39,7 @@ set(_libcudacxx_INCLUDE_DIR "${_libcudacxx_VERSION_INCLUDE_DIR}" ) unset(_libcudacxx_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache target_include_directories(_libcudacxx_libcudacxx INTERFACE "${_libcudacxx_INCLUDE_DIR}") +target_compile_definitions(_libcudacxx_libcudacxx INTERFACE $<$:CCCL_ENABLE_ASSERTIONS>) # # Standardize version info From cb5921b33dc8ae1c4038a6386d79c1a973422c45 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sat, 30 Nov 2024 16:31:02 +0100 Subject: [PATCH 45/45] Fix CMake warning for FindPythonInterp (#2982) CMake Warning (dev) at libcudacxx/CMakeLists.txt:43 (include): Policy CMP0148 is not set: The FindPythonInterp and FindPythonLibs modules are removed. Run "cmake --help-policy CMP0148" for policy details. 
Use the cmake_policy command to set the policy and suppress this warning. --- libcudacxx/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcudacxx/CMakeLists.txt b/libcudacxx/CMakeLists.txt index 1ccfb6a92ff..39f86b6bdb2 100644 --- a/libcudacxx/CMakeLists.txt +++ b/libcudacxx/CMakeLists.txt @@ -40,8 +40,8 @@ option(LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS "Enable libcu++ tests." ON) if (LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS) enable_testing() - include(FindPythonInterp) - if (NOT PYTHONINTERP_FOUND) + find_package (Python COMPONENTS Interpreter) + if (NOT Python_Interpreter_FOUND) message(FATAL_ERROR "Failed to find python interpreter, which is required for running tests and " "building a libcu++ static library.")
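Illustrative aside, not part of the patch: a minimal, self-contained CMake sketch of the modernized Python detection this last patch switches to. The project name and standalone layout are hypothetical; the real libcudacxx/CMakeLists.txt does more with the interpreter than shown here.

cmake_minimum_required(VERSION 3.15)
project(python_detection_demo LANGUAGES NONE)

# FindPythonInterp/FindPythonLibs are deprecated (policy CMP0148); FindPython replaces them.
find_package(Python COMPONENTS Interpreter)
if (NOT Python_Interpreter_FOUND)
  message(FATAL_ERROR "Failed to find a Python interpreter")
endif ()
message(STATUS "Using Python interpreter: ${Python_EXECUTABLE}")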