From 8a81e7c77b79ef1074426dde5e0e1e36cac2de83 Mon Sep 17 00:00:00 2001 From: michael bacci Date: Thu, 24 Jun 2021 00:23:23 +0200 Subject: [PATCH] Add new Algorithms using explicit batch type --- README.md | 40 ++++ include/xsimd/stl/algorithms.hpp | 323 ++++++++++++++++++++++--------- test/test_algorithms.cpp | 159 ++++++++++++++- 3 files changed, 429 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index aa769dfe1..25fc1e9c3 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,46 @@ void mean(const vector_type& a, const vector_type& b, vector_type& res) } ``` +Algorithms like `xsimd::reduce` and `xsimd::transform` are available also in the batch explicit modality: + +```cpp +template ::type> +T nansum(const C& v) +{ + return xsimd::reduce_batch(v.begin(), v.end(), 0.0, + [](auto x, auto y) { + return (std::isnan(x) ? 0.0 : x) + (std::isnan(y) ? 0.0 : y); + }, + [](auto x, auto y) { + static decltype(x) zero(0.0); + auto xnan = xsimd::isnan(x); + auto ynan = xsimd::isnan(y); + auto xs = xsimd::select(xnan, zero, x); + auto ys = xsimd::select(ynan, zero, y); + return xs + ys; + }); +} +``` + +To switch from `std::count_if` to `xsimd::count_if`: + +```cpp +// v is an aligned vector of int type +auto count_expected = std::count_if(v.begin(), v.end(), +[](auto x) { + return x >= 50 && x <= 70 ? 1 : 0; +}); +auto count = xsimd::count_if(v.begin(), v.end(), +[](auto x) { + return x >= 50 && x <= 70 ? 1 : 0; +}, +[](auto b) { + static decltype(b) zero(0); + static decltype(b) one(1); + return xsimd::hadd(xsimd::select(b >= 50 && b <= 70, one, zero)); +}); +assert(count_expected == count); +``` ## Building and Running the Tests diff --git a/include/xsimd/stl/algorithms.hpp b/include/xsimd/stl/algorithms.hpp index 82defaa6e..f71b1b9c2 100644 --- a/include/xsimd/stl/algorithms.hpp +++ b/include/xsimd/stl/algorithms.hpp @@ -11,125 +11,175 @@ #ifndef XSIMD_ALGORITHMS_HPP #define XSIMD_ALGORITHMS_HPP +#include #include "../memory/xsimd_load_store.hpp" namespace xsimd { - template - void transform(I1 first, I2 last, O1 out_first, UF&& f) + namespace detail { - using value_type = typename std::decay::type; - using traits = simd_traits; - using batch_type = typename traits::type; + template + using void_t_v = void; - std::size_t size = static_cast(std::distance(first, last)); - std::size_t simd_size = traits::size; + template + struct has_increment : std::false_type {}; - const auto* ptr_begin = &(*first); - auto* ptr_out = &(*out_first); + template + struct has_increment())>> : std::true_type {}; - std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size); - std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); - std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1)); + template + struct have_increment : all_true::value...> {}; + + template + struct not_have_increment : all_true::value...> {}; + + template + using enable_unary_algorithm_t = typename std::enable_if::value && not_have_increment::value, int>::type; - if (align_begin == out_align) + template + using enable_binary_algorithm_t = typename std::enable_if::value && not_have_increment::value, int>::type; + + template + void transform(I1 first, I2 last, O1 out_first, UF&& f, UFB&& fb) { - for (std::size_t i = 0; i < align_begin; ++i) - { - out_first[i] = f(first[i]); - } + using value_type = typename std::decay::type; + using traits = simd_traits; + using batch_type = typename traits::type; - batch_type batch; - for (std::size_t i = align_begin; i < align_end; i += simd_size) + std::size_t size = static_cast(std::distance(first, last)); + std::size_t simd_size = traits::size; + + const auto* ptr_begin = &(*first); + auto* ptr_out = &(*out_first); + + std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size); + std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); + std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1)); + + if (align_begin == out_align) { - xsimd::load_aligned(&first[i], batch); - xsimd::store_aligned(&out_first[i], f(batch)); + for (std::size_t i = 0; i < align_begin; ++i) + { + out_first[i] = f(first[i]); + } + + batch_type batch; + for (std::size_t i = align_begin; i < align_end; i += simd_size) + { + xsimd::load_aligned(&first[i], batch); + xsimd::store_aligned(&out_first[i], fb(batch)); + } + + for (std::size_t i = align_end; i < size; ++i) + { + out_first[i] = f(first[i]); + } } - - for (std::size_t i = align_end; i < size; ++i) + else { - out_first[i] = f(first[i]); + for (std::size_t i = 0; i < align_begin; ++i) + { + out_first[i] = f(first[i]); + } + + batch_type batch; + for (std::size_t i = align_begin; i < align_end; i += simd_size) + { + xsimd::load_aligned(&first[i], batch); + xsimd::store_unaligned(&out_first[i], fb(batch)); + } + + for (std::size_t i = align_end; i < size; ++i) + { + out_first[i] = f(first[i]); + } } } - else + + template + void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f, UFB&& fb) { - for (std::size_t i = 0; i < align_begin; ++i) + using value_type = typename std::decay::type; + using traits = simd_traits; + using batch_type = typename traits::type; + + std::size_t size = static_cast(std::distance(first_1, last_1)); + std::size_t simd_size = traits::size; + + const auto* ptr_begin_1 = &(*first_1); + const auto* ptr_begin_2 = &(*first_2); + auto* ptr_out = &(*out_first); + + std::size_t align_begin_1 = xsimd::get_alignment_offset(ptr_begin_1, size, simd_size); + std::size_t align_begin_2 = xsimd::get_alignment_offset(ptr_begin_2, size, simd_size); + std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); + std::size_t align_end = align_begin_1 + ((size - align_begin_1) & ~(simd_size - 1)); + + #define XSIMD_LOOP_MACRO(A1, A2, A3) \ + for (std::size_t i = 0; i < align_begin_1; ++i) \ + { \ + out_first[i] = f(first_1[i], first_2[i]); \ + } \ + \ + batch_type batch_1, batch_2; \ + for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \ + { \ + xsimd::A1(&first_1[i], batch_1); \ + xsimd::A2(&first_2[i], batch_2); \ + xsimd::A3(&out_first[i], fb(batch_1, batch_2)); \ + } \ + \ + for (std::size_t i = align_end; i < size; ++i) \ + { \ + out_first[i] = f(first_1[i], first_2[i]); \ + } \ + + if (align_begin_1 == out_align && align_begin_1 == align_begin_2) { - out_first[i] = f(first[i]); + XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_aligned); } - - batch_type batch; - for (std::size_t i = align_begin; i < align_end; i += simd_size) + else if (align_begin_1 == out_align && align_begin_1 != align_begin_2) { - xsimd::load_aligned(&first[i], batch); - xsimd::store_unaligned(&out_first[i], f(batch)); + XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_aligned); } - - for (std::size_t i = align_end; i < size; ++i) + else if (align_begin_1 != out_align && align_begin_1 == align_begin_2) { - out_first[i] = f(first[i]); + XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_unaligned); } + else if (align_begin_1 != out_align && align_begin_1 != align_begin_2) + { + XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_unaligned); + } + + #undef XSIMD_LOOP_MACRO } } - template - void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f) + template = 0> + void transform(I1 first, I2 last, O1 out_first, UF&& f, UFB&& fb) { - using value_type = typename std::decay::type; - using traits = simd_traits; - using batch_type = typename traits::type; + detail::transform(first, last, out_first, f, fb); + } - std::size_t size = static_cast(std::distance(first_1, last_1)); - std::size_t simd_size = traits::size; - - const auto* ptr_begin_1 = &(*first_1); - const auto* ptr_begin_2 = &(*first_2); - auto* ptr_out = &(*out_first); - - std::size_t align_begin_1 = xsimd::get_alignment_offset(ptr_begin_1, size, simd_size); - std::size_t align_begin_2 = xsimd::get_alignment_offset(ptr_begin_2, size, simd_size); - std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); - std::size_t align_end = align_begin_1 + ((size - align_begin_1) & ~(simd_size - 1)); - - #define XSIMD_LOOP_MACRO(A1, A2, A3) \ - for (std::size_t i = 0; i < align_begin_1; ++i) \ - { \ - out_first[i] = f(first_1[i], first_2[i]); \ - } \ - \ - batch_type batch_1, batch_2; \ - for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \ - { \ - xsimd::A1(&first_1[i], batch_1); \ - xsimd::A2(&first_2[i], batch_2); \ - xsimd::A3(&out_first[i], f(batch_1, batch_2)); \ - } \ - \ - for (std::size_t i = align_end; i < size; ++i) \ - { \ - out_first[i] = f(first_1[i], first_2[i]); \ - } \ - - if (align_begin_1 == out_align && align_begin_1 == align_begin_2) - { - XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_aligned); - } - else if (align_begin_1 == out_align && align_begin_1 != align_begin_2) - { - XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_aligned); - } - else if (align_begin_1 != out_align && align_begin_1 == align_begin_2) - { - XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_unaligned); - } - else if (align_begin_1 != out_align && align_begin_1 != align_begin_2) - { - XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_unaligned); - } + template + void transform(I1 first, I2 last, O1 out_first, UF&& f) + { + detail::transform(first, last, out_first, f, f); + } - #undef XSIMD_LOOP_MACRO + template + void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f, UFB&& fb) + { + detail::transform(first_1, last_1, first_2, out_first, f, fb); } + template = 0> + void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f) + { + detail::transform(first_1, last_1, first_2, out_first, f, f); + } // TODO: Remove this once we drop C++11 support namespace detail @@ -141,9 +191,8 @@ namespace xsimd }; } - - template - Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus{}) + template + Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun, BinaryFunctionBatch&& binfun_batch) { using value_type = typename std::decay::type; using traits = simd_traits; @@ -180,7 +229,7 @@ namespace xsimd for (auto const end = ptr_begin + align_end; ptr < end; ptr += simd_size) { xsimd::load_aligned(ptr, batch); - batch_init = binfun(batch_init, batch); + batch_init = binfun_batch(batch_init, batch); } // reduce across batch @@ -197,6 +246,96 @@ namespace xsimd return init; } + template + Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus{}) + { + return reduce(first, last, init, binfun, binfun); + } + + namespace detail + { + template + struct count_batch + { + count_batch(T value) + : value(value) + {} + + count_batch(const count_batch&) = default; + count_batch(count_batch&&) = default; + + template + std::size_t operator()(const B& b) + { + static auto zero = B(T(0)); + static auto one = B(T(1)); + return static_cast(xsimd::hadd(xsimd::select(b == value, one, zero))); + } + + private: + T value; + }; + } + + template + std::size_t count_if(Iterator1 first, Iterator2 last, UnaryPredicate&& predicate, UnaryPredicateBatch&& predicate_batch) + { + using value_type = typename std::decay::type; + using traits = simd_traits; + using batch_type = typename traits::type; + + std::size_t size = static_cast(std::distance(first, last)); + constexpr std::size_t simd_size = traits::size; + + std::size_t counter(0); + if(size < simd_size) + { + while(first != last) + { + counter += predicate(*first++); + } + return counter; + } + + const auto* const ptr_begin = &(*first); + + std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size); + std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1)); + + // reduce initial unaligned part + for (std::size_t i = 0; i < align_begin; ++i) + { + counter += predicate(first[i]); + } + + // reduce aligned part + batch_type batch; + auto ptr = ptr_begin + align_begin; + for (auto const end = ptr_begin + align_end; ptr < end; ptr += simd_size) + { + xsimd::load_aligned(ptr, batch); + counter += predicate_batch(batch); + } + + // reduce final unaligned part + for (std::size_t i = align_end; i < size; ++i) + { + counter += predicate(first[i]); + } + + return counter; + } + + template + std::size_t count(Iterator1 first, Iterator2 last, const T& value) + { + return count_if(first, + last, + [&value](const T& x) { return value == x; }, + detail::count_batch{value} + ); + } + } #endif diff --git a/test/test_algorithms.cpp b/test/test_algorithms.cpp index b749e6874..5fb2c4d73 100644 --- a/test/test_algorithms.cpp +++ b/test/test_algorithms.cpp @@ -10,6 +10,10 @@ #include #include "test_utils.hpp" +#include +#include +#include +#include struct binary_functor { @@ -37,6 +41,64 @@ template using test_allocator_type = std::allocator; #endif +template +struct types { + using value_type = typename std::decay::type; + using traits = xsimd::simd_traits; + using batch_type = typename traits::type; +}; + +#if XSIMD_X86_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE || XSIMD_ARM_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE +TEST(algorithms, unary_transform_batch) +{ + using vector_type = std::vector>; + using batch_type = types::batch_type; + auto flip_flop = vector_type(42, 0); + std::iota(flip_flop.begin(), flip_flop.end(), 1); + auto square_pair = [](int x) { + return !(x % 2) ? x : x*x; + }; + auto flip_flop_axpected = flip_flop; + std::transform(flip_flop_axpected.begin(), flip_flop_axpected.end(), flip_flop_axpected.begin(), square_pair); + + xsimd::transform(flip_flop.begin(), flip_flop.end(), flip_flop.begin(), + // NOTE: since c++14 a simple `[](auto x)` reduce code complexity + [](int x) { + return !(x % 2) ? x : x*x; + }, + // NOTE: since c++14 a simple `[](auto b)` reduce code complexity + [](batch_type b) { + return xsimd::select(!(b % 2), b, b*b); + }); + EXPECT_TRUE(std::equal(flip_flop_axpected.begin(), flip_flop_axpected.end(), flip_flop.begin()) && flip_flop_axpected.size() == flip_flop.size()); +} + +TEST(algorithms, binary_transform_batch) +{ + using vector_type = std::vector>; + using batch_type = types::batch_type; + auto flip_flop_a = vector_type(42, 0); + auto flip_flop_b = vector_type(42, 0); + std::iota(flip_flop_a.begin(), flip_flop_a.end(), 1); + std::iota(flip_flop_b.begin(), flip_flop_b.end(), 3); + auto square_pair = [](int x, int y) { + return !((x + y) % 2) ? x + y : x*x + y*y; + }; + auto flip_flop_axpected = flip_flop_a; + std::transform(flip_flop_a.begin(), flip_flop_a.end(), flip_flop_b.begin(), flip_flop_axpected.begin(), square_pair); + + auto flip_flop_result = vector_type(flip_flop_axpected.size()); + xsimd::transform(flip_flop_a.begin(), flip_flop_a.end(), flip_flop_b.begin(), flip_flop_result.begin(), + [](int x, int y) { + return !((x +y) % 2) ? x + y : x*x + y*y; + }, + [](batch_type bx, batch_type by) { + return xsimd::select(!((bx + by) % 2), bx + by, bx*bx + by+by); + }); + EXPECT_TRUE(std::equal(flip_flop_axpected.begin(), flip_flop_axpected.end(), flip_flop_result.begin()) && flip_flop_axpected.size() == flip_flop_result.size()); +} +#endif + TEST(algorithms, binary_transform) { std::vector expected(93); @@ -83,7 +145,6 @@ TEST(algorithms, binary_transform) std::fill(ca.begin(), ca.end(), -1); // erase } - TEST(algorithms, unary_transform) { std::vector expected(93); @@ -216,6 +277,102 @@ TEST_F(xsimd_reduce, using_custom_binary_function) } } +TEST(algorithms, reduce_batch) +{ + const float nan = std::numeric_limits::quiet_NaN(); + using vector_type = std::vector>; + using batch_type = types::batch_type; + auto vector_with_nan = vector_type(1000, 0); + std::iota(vector_with_nan.begin(), vector_with_nan.end(), 3.5); + auto i = 0; + auto add_nan = [&i, &nan](const float x) { + return i % 2 ? nan : x; + }; + std::transform(vector_with_nan.begin(), vector_with_nan.end(), vector_with_nan.begin(), add_nan); + + auto nansum_expected = std::accumulate(vector_with_nan.begin(), vector_with_nan.end(), 0.0, + [](float x, float y) { + return (std::isnan(x) ? 0.0 : x) + (std::isnan(y) ? 0.0 : y); + }); + + auto nansum = xsimd::reduce(vector_with_nan.begin(), vector_with_nan.end(), 0.0, + [](float x, float y) { + return (std::isnan(x) ? 0.0 : x) + (std::isnan(y) ? 0.0 : y); + }, + [](batch_type x, batch_type y) { + static batch_type zero(0.0); + auto xnan = xsimd::isnan(x); + auto ynan = xsimd::isnan(y); + auto xs = xsimd::select(xnan, zero, x); + auto ys = xsimd::select(ynan, zero, y); + return xs + ys; + }); + + EXPECT_NEAR(nansum_expected, nansum, 1e-6); + + auto count_nan_expected = std::count_if(vector_with_nan.begin(), vector_with_nan.end(), + [](float x){ + return static_cast(std::isnan(x)); + }); + + auto count_nan = xsimd::count_if(vector_with_nan.begin(), vector_with_nan.end(), + [](float x){ + return static_cast(std::isnan(x)); + }, + [](batch_type b) { + static decltype(b) zero(0.0); + static decltype(b) one(1.0); + auto bnan = xsimd::isnan(b); + return static_cast(xsimd::hadd(xsimd::select(bnan, one, zero))); + }); + + EXPECT_EQ(count_nan_expected, count_nan); + + auto count_not_nan_expected = vector_with_nan.size() - count_nan_expected; + auto count_not_nan = vector_with_nan.size() - count_nan; + + auto nanmean_expected = count_not_nan_expected ? nansum_expected / (float)count_not_nan_expected : 0; + auto nanmean = count_not_nan ? nansum / (float)count_not_nan : 0; + + EXPECT_NEAR(nanmean_expected, nanmean, 1e-6); +} + +TEST(algorithms, count) +{ + using vector_type = std::vector>; + auto v = vector_type(100, 0); + std::iota(v.begin(), v.end(), 3.14); + v[12] = 123.321; + v[42] = 123.321; + v[93] = 123.321; + + EXPECT_EQ(3, xsimd::count(v.begin(), v.end(), 123.321)); +} + +TEST(algorithms, count_if) +{ + using vector_type = std::vector>; + using batch_type = types::batch_type; + auto v = vector_type(100, 0); + std::iota(v.begin(), v.end(), 1); + + auto count_expected = std::count_if(v.begin(), v.end(), + [](int x) { + return x >= 50 && x <= 70 ? 1 : 0; + }); + + auto count = xsimd::count_if(v.begin(), v.end(), + [](int x) { + return x >= 50 && x <= 70 ? 1 : 0; + }, + [](batch_type b) { + static batch_type zero(0); + static batch_type one(1); + return xsimd::hadd(xsimd::select(b >= 50 && b <= 70, one, zero)); + }); + EXPECT_EQ(count_expected, count); +} + #if XSIMD_X86_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE || XSIMD_ARM_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE TEST(algorithms, iterator) {