
Commit

Merge branch 'main' into impl_cxx_26_std_ignore
miscco authored Nov 30, 2024
2 parents b50844d + cb5921b commit ba4800b
Showing 560 changed files with 18,955 additions and 15,050 deletions.
1 change: 1 addition & 0 deletions .github/copy-pr-bot.yaml
@@ -2,6 +2,7 @@
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/

enabled: true
auto_sync_draft: false
additional_trustees:
- ahendriksen
- gonzalobg
19 changes: 18 additions & 1 deletion c2h/include/c2h/generators.h
@@ -35,7 +35,24 @@
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
# endif // _CCCL_HAS_NVFP16

# if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
# include <cuda_fp8.h>
_CCCL_DIAG_POP
# endif // _CCCL_CUDACC_AT_LEAST(11, 8)
# endif // _CCCL_HAS_NVBF16

# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
19 changes: 1 addition & 18 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -4,24 +4,7 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif
#include "common.h"

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
19 changes: 1 addition & 18 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -4,24 +4,7 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif
#include "common.h"

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
19 changes: 1 addition & 18 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -4,24 +4,7 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif
#include "common.h"

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
cub/benchmarks/bench/transform/common.h
@@ -1,7 +1,22 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE && TUNE_ALGORITHM == 1
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif
# ifndef _CUB_HAS_TRANSFORM_UBLKCP
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>
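With these checks consolidated at the top of common.h, a discarded tuning variant fails during preprocessing, before any kernel template is instantiated. A minimal sketch of how a tuning translation unit hits the guards follows (illustrative only, not part of this commit; in practice the TUNE_* macros are passed on the compiler command line by the tuning harness, and the values below are assumptions):

// Illustrative sketch: macro values assumed for demonstration.
#define TUNE_BASE 0       // 0 = tuning variant (the guarded branch), 1 = baseline
#define TUNE_ALGORITHM 1  // 1 selects the ublkcp-based algorithm
#define TUNE_THREADS 256
#include "common.h" // emits #error immediately unless __CUDA_ARCH_LIST__ names exactly
                    // one architecture >= 900 and _CUB_HAS_TRANSFORM_UBLKCP is defined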
31 changes: 31 additions & 0 deletions cub/benchmarks/bench/transform/complex_cmp.cu
@@ -0,0 +1,31 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

#include "common.h"

// This benchmark tests overlapping memory regions for reading and is compute intensive

template <typename OffsetT>
static void compare_complex(nvbench::state& state, nvbench::type_list<OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<complex> in = generate(n);
thrust::device_vector<bool> out(n - 1);

state.add_element_count(n);
state.add_global_memory_reads<complex>(n);
state.add_global_memory_writes<bool>(n);

// the complex comparison needs lots of compute and transform reads from overlapping input
using compare_op = less_t;
bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{});
}

// TODO(bgruber): hardcode OffsetT?
NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types))
.set_name("compare_complex")
.set_type_axes_names({"OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
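Conceptually, the bench_transform call above performs a pairwise comparison of adjacent elements by reading two overlapping input ranges. A rough Thrust equivalent, as a hedged sketch (not part of the commit; it assumes less_t and the complex alias come from the helpers pulled in via common.h):

#include <thrust/transform.h>

// Illustrative sketch only:
// out[i] = less_t{}(in[i], in[i + 1]) for i in [0, n - 2]; the input ranges overlap.
thrust::transform(in.begin(), in.end() - 1, // first range:  in[0] .. in[n-2]
                  in.begin() + 1,           // second range: in[1] .. in[n-1]
                  out.begin(),              // n - 1 boolean results
                  less_t{});                // same comparator the benchmark uses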
59 changes: 59 additions & 0 deletions cub/benchmarks/bench/transform/fib.cu
@@ -0,0 +1,59 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

#include "common.h"

// This benchmark is compute intensive with diverging threads

template <class IndexT, class OutputT>
struct fib_t
{
__device__ OutputT operator()(IndexT n)
{
OutputT t1 = 0;
OutputT t2 = 1;

if (n < 1)
{
return t1;
}
if (n == 1)
{
return t1;
}
if (n == 2)
{
return t2;
}
for (IndexT i = 3; i <= n; ++i)
{
const auto next = t1 + t2;
t1 = t2;
t2 = next;
}
return t2;
}
};
template <typename OffsetT>
static void fibonacci(nvbench::state& state, nvbench::type_list<OffsetT>)
{
using index_t = int64_t;
using output_t = uint32_t;
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<index_t> in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42});
thrust::device_vector<output_t> out(n);

state.add_element_count(n);
state.add_global_memory_reads<index_t>(n);
state.add_global_memory_writes<output_t>(n);

bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t<index_t, output_t>{});
}

NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types))
.set_name("fibonacci")
.set_type_axes_names({"OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
62 changes: 62 additions & 0 deletions cub/benchmarks/bench/transform/heavy.cu
@@ -0,0 +1,62 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

#include "common.h"

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
struct heavy_functor
{
// we need to use an unsigned type so overflow in arithmetic wraps around
__device__ std::uint32_t operator()(std::uint32_t data) const
{
std::uint32_t reg[N];
reg[0] = data;
for (int i = 1; i < N; ++i)
{
reg[i] = reg[i - 1] * reg[i - 1] + 1;
}
for (int i = 0; i < N; ++i)
{
reg[i] = (reg[i] * reg[i]) % 19;
}
for (int i = 0; i < N; ++i)
{
reg[i] = reg[N - i - 1] * reg[i];
}
std::uint32_t x = 0;
for (int i = 0; i < N; ++i)
{
x += reg[i];
}
return x;
}
};

template <typename Heaviness>
static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
{
using value_t = std::uint32_t;
using offset_t = int;
const auto n = narrow<offset_t>(state.get_int64("Elements{io}"));
thrust::device_vector<value_t> in = generate(n);
thrust::device_vector<value_t> out(n);

state.add_element_count(n);
state.add_global_memory_reads<value_t>(n);
state.add_global_memory_writes<value_t>(n);

bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
}

template <int I>
using ic = ::cuda::std::integral_constant<int, I>;

NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
.set_name("heavy")
.set_type_axes_names({"Heaviness{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));