
Commit

Merge branch 'main' into impl_cxx_26_std_ignore
miscco authored Nov 30, 2024
2 parents b50844d + cb5921b commit ba4800b
Showing 560 changed files with 18,955 additions and 15,050 deletions.
1 change: 1 addition & 0 deletions .github/copy-pr-bot.yaml
@@ -2,6 +2,7 @@
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/

enabled: true
auto_sync_draft: false
additional_trustees:
- ahendriksen
- gonzalobg
19 changes: 18 additions & 1 deletion c2h/include/c2h/generators.h
@@ -35,7 +35,24 @@
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
# endif // _CCCL_HAS_NVFP16

# if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
# include <cuda_fp8.h>
_CCCL_DIAG_POP
# endif // _CCCL_CUDACC_AT_LEAST(11, 8)
# endif // _CCCL_HAS_NVBF16

# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
19 changes: 1 addition & 18 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -4,24 +4,7 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif
#include "common.h"

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
19 changes: 1 addition & 18 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -4,24 +4,7 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif
#include "common.h"

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
19 changes: 1 addition & 18 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -4,24 +4,7 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif
#include "common.h"

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
cub/benchmarks/bench/transform/common.h
@@ -1,7 +1,22 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE && TUNE_ALGORITHM == 1
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif
# ifndef _CUB_HAS_TRANSFORM_UBLKCP
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>
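With these checks consolidated at the top of common.h, a discarded tuning variant fails during preprocessing, before any kernel template is instantiated. A minimal sketch of how a tuning translation unit hits the guards follows (illustrative only, not part of this commit; in practice the TUNE_* macros are passed on the compiler command line by the tuning harness, and the values below are assumptions):

// Illustrative sketch: macro values assumed for demonstration.
#define TUNE_BASE 0       // 0 = tuning variant (the guarded branch), 1 = baseline
#define TUNE_ALGORITHM 1  // 1 selects the ublkcp-based algorithm
#define TUNE_THREADS 256
#include "common.h" // emits #error immediately unless __CUDA_ARCH_LIST__ names exactly
                    // one architecture >= 900 and _CUB_HAS_TRANSFORM_UBLKCP is defined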
31 changes: 31 additions & 0 deletions cub/benchmarks/bench/transform/complex_cmp.cu
@@ -0,0 +1,31 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

#include "common.h"

// This benchmark tests overlapping memory regions for reading and is compute intensive

template <typename OffsetT>
static void compare_complex(nvbench::state& state, nvbench::type_list<OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<complex> in = generate(n);
thrust::device_vector<bool> out(n - 1);

state.add_element_count(n);
state.add_global_memory_reads<complex>(n);
state.add_global_memory_writes<bool>(n);

// the complex comparison needs lots of compute and transform reads from overlapping input
using compare_op = less_t;
bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{});
}

// TODO(bgruber): hardcode OffsetT?
NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types))
.set_name("compare_complex")
.set_type_axes_names({"OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
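Conceptually, the bench_transform call above performs a pairwise comparison of adjacent elements by reading two overlapping input ranges. A rough Thrust equivalent, as a hedged sketch (not part of the commit; it assumes less_t and the complex alias come from the helpers pulled in via common.h):

#include <thrust/transform.h>

// Illustrative sketch only:
// out[i] = less_t{}(in[i], in[i + 1]) for i in [0, n - 2]; the input ranges overlap.
thrust::transform(in.begin(), in.end() - 1, // first range:  in[0] .. in[n-2]
                  in.begin() + 1,           // second range: in[1] .. in[n-1]
                  out.begin(),              // n - 1 boolean results
                  less_t{});                // same comparator the benchmark uses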
59 changes: 59 additions & 0 deletions cub/benchmarks/bench/transform/fib.cu
@@ -0,0 +1,59 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

#include "common.h"

// This benchmark is compute intensive with diverging threads

template <class IndexT, class OutputT>
struct fib_t
{
__device__ OutputT operator()(IndexT n)
{
OutputT t1 = 0;
OutputT t2 = 1;

if (n < 1)
{
return t1;
}
if (n == 1)
{
return t1;
}
if (n == 2)
{
return t2;
}
for (IndexT i = 3; i <= n; ++i)
{
const auto next = t1 + t2;
t1 = t2;
t2 = next;
}
return t2;
}
};
template <typename OffsetT>
static void fibonacci(nvbench::state& state, nvbench::type_list<OffsetT>)
{
using index_t = int64_t;
using output_t = uint32_t;
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<index_t> in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42});
thrust::device_vector<output_t> out(n);

state.add_element_count(n);
state.add_global_memory_reads<index_t>(n);
state.add_global_memory_writes<output_t>(n);

bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t<index_t, output_t>{});
}

NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types))
.set_name("fibonacci")
.set_type_axes_names({"OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
62 changes: 62 additions & 0 deletions cub/benchmarks/bench/transform/heavy.cu
@@ -0,0 +1,62 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

#include "common.h"

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
struct heavy_functor
{
// we need to use an unsigned type so overflow in arithmetic wraps around
__device__ std::uint32_t operator()(std::uint32_t data) const
{
std::uint32_t reg[N];
reg[0] = data;
for (int i = 1; i < N; ++i)
{
reg[i] = reg[i - 1] * reg[i - 1] + 1;
}
for (int i = 0; i < N; ++i)
{
reg[i] = (reg[i] * reg[i]) % 19;
}
for (int i = 0; i < N; ++i)
{
reg[i] = reg[N - i - 1] * reg[i];
}
std::uint32_t x = 0;
for (int i = 0; i < N; ++i)
{
x += reg[i];
}
return x;
}
};

template <typename Heaviness>
static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
{
using value_t = std::uint32_t;
using offset_t = int;
const auto n = narrow<offset_t>(state.get_int64("Elements{io}"));
thrust::device_vector<value_t> in = generate(n);
thrust::device_vector<value_t> out(n);

state.add_element_count(n);
state.add_global_memory_reads<value_t>(n);
state.add_global_memory_writes<value_t>(n);

bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
}

template <int I>
using ic = ::cuda::std::integral_constant<int, I>;

NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
.set_name("heavy")
.set_type_axes_names({"Heaviness{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));