-
Notifications
You must be signed in to change notification settings - Fork 168
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into impl_cxx_26_std_ignore
- Loading branch information
Showing
560 changed files
with
18,955 additions
and
15,050 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
cub/benchmarks/bench/transform/babelstream.h → cub/benchmarks/bench/transform/common.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:128 | ||
// %RANGE% TUNE_ALGORITHM alg 0:1:1 | ||
|
||
#include "common.h" | ||
|
||
// This benchmark tests overlapping memory regions for reading and is compute intensive | ||
|
||
template <typename OffsetT> | ||
static void compare_complex(nvbench::state& state, nvbench::type_list<OffsetT>) | ||
{ | ||
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<complex> in = generate(n); | ||
thrust::device_vector<bool> out(n - 1); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<complex>(n); | ||
state.add_global_memory_writes<bool>(n); | ||
|
||
// the complex comparison needs lots of compute and transform reads from overlapping input | ||
using compare_op = less_t; | ||
bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{}); | ||
} | ||
|
||
// TODO(bgruber): hardcode OffsetT? | ||
NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types)) | ||
.set_name("compare_complex") | ||
.set_type_axes_names({"OffsetT{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:128 | ||
// %RANGE% TUNE_ALGORITHM alg 0:1:1 | ||
|
||
#include "common.h" | ||
|
||
// This benchmark is compute intensive with diverging threads | ||
|
||
template <class IndexT, class OutputT> | ||
struct fib_t | ||
{ | ||
__device__ OutputT operator()(IndexT n) | ||
{ | ||
OutputT t1 = 0; | ||
OutputT t2 = 1; | ||
|
||
if (n < 1) | ||
{ | ||
return t1; | ||
} | ||
if (n == 1) | ||
{ | ||
return t1; | ||
} | ||
if (n == 2) | ||
{ | ||
return t2; | ||
} | ||
for (IndexT i = 3; i <= n; ++i) | ||
{ | ||
const auto next = t1 + t2; | ||
t1 = t2; | ||
t2 = next; | ||
} | ||
return t2; | ||
} | ||
}; | ||
template <typename OffsetT> | ||
static void fibonacci(nvbench::state& state, nvbench::type_list<OffsetT>) | ||
{ | ||
using index_t = int64_t; | ||
using output_t = uint32_t; | ||
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<index_t> in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42}); | ||
thrust::device_vector<output_t> out(n); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<index_t>(n); | ||
state.add_global_memory_writes<output_t>(n); | ||
|
||
bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t<index_t, output_t>{}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types)) | ||
.set_name("fibonacci") | ||
.set_type_axes_names({"OffsetT{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:128 | ||
// %RANGE% TUNE_ALGORITHM alg 0:1:1 | ||
|
||
#include "common.h" | ||
|
||
// This benchmark uses a LOT of registers and is compute intensive. | ||
|
||
template <int N> | ||
struct heavy_functor | ||
{ | ||
// we need to use an unsigned type so overflow in arithmetic wraps around | ||
__device__ std::uint32_t operator()(std::uint32_t data) const | ||
{ | ||
std::uint32_t reg[N]; | ||
reg[0] = data; | ||
for (int i = 1; i < N; ++i) | ||
{ | ||
reg[i] = reg[i - 1] * reg[i - 1] + 1; | ||
} | ||
for (int i = 0; i < N; ++i) | ||
{ | ||
reg[i] = (reg[i] * reg[i]) % 19; | ||
} | ||
for (int i = 0; i < N; ++i) | ||
{ | ||
reg[i] = reg[N - i - 1] * reg[i]; | ||
} | ||
std::uint32_t x = 0; | ||
for (int i = 0; i < N; ++i) | ||
{ | ||
x += reg[i]; | ||
} | ||
return x; | ||
} | ||
}; | ||
|
||
template <typename Heaviness> | ||
static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>) | ||
{ | ||
using value_t = std::uint32_t; | ||
using offset_t = int; | ||
const auto n = narrow<offset_t>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<value_t> in = generate(n); | ||
thrust::device_vector<value_t> out(n); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<value_t>(n); | ||
state.add_global_memory_writes<value_t>(n); | ||
|
||
bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{}); | ||
} | ||
|
||
template <int I> | ||
using ic = ::cuda::std::integral_constant<int, I>; | ||
|
||
NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>)) | ||
.set_name("heavy") | ||
.set_type_axes_names({"Heaviness{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); |
Oops, something went wrong.