Skip to content

Commit

Permalink
make the upper limit on TMP loop unrolling configurable (NVIDIA#2971)
Browse files Browse the repository at this point in the history
* make the upper limit on TMP loop unrolling configurable

* lower the TMP PP unrolling limit on nvcc and nvhpc, whose parsers are slow
  • Loading branch information
ericniebler authored and davebayer committed Dec 2, 2024
1 parent 3009523 commit a2aa971
Showing 1 changed file with 22 additions and 14 deletions.
36 changes: 22 additions & 14 deletions libcudacxx/include/cuda/std/__type_traits/type_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@
//! For the purpose of this file, a "trait type" is a class type with a nested
//! type alias named \c type.

// Upper bound on how far the template-metaprogramming helpers in this file are
// unrolled via preprocessor repetition. Define _CCCL_META_UNROLL_LIMIT before
// this point to override the defaults selected below.
#ifndef _CCCL_META_UNROLL_LIMIT
#  if !defined(_CCCL_CUDA_COMPILER_NVCC) && !(_CCCL_COMPILER(NVHPC))
#    define _CCCL_META_UNROLL_LIMIT 16
#  else // nvcc or nvhpc: their parsers are slow, so unroll less
#    define _CCCL_META_UNROLL_LIMIT 10
#  endif
#endif // _CCCL_META_UNROLL_LIMIT

_LIBCUDACXX_BEGIN_NAMESPACE_STD

#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
Expand Down Expand Up @@ -502,7 +510,7 @@ struct __type_index_small_size_fn;
using __call _CCCL_NODEBUG_ALIAS = _Ty; \
};

_CCCL_PP_REPEAT_REVERSE(16, _M1)
_CCCL_PP_REPEAT_REVERSE(_CCCL_META_UNROLL_LIMIT, _M1)

# undef _M0
# undef _M1
Expand All @@ -524,7 +532,7 @@ struct __type_index_select_fn<true> // Fast implementation for smaller indices
} // namespace __detail

template <class _Ip, class... _Ts>
using __type_index = __type_call<__detail::__type_index_select_fn<(_Ip::value < 16)>, _Ip, _Ts...>;
using __type_index = __type_call<__detail::__type_index_select_fn<(_Ip::value < _CCCL_META_UNROLL_LIMIT)>, _Ip, _Ts...>;

// Same as __type_index, except that the index is supplied directly as a size_t
// value; it is wrapped in an integral_constant before being forwarded.
template <size_t _Idx, class... _Types>
using __type_index_c = __type_index<integral_constant<size_t, _Idx>, _Types...>;
Expand Down Expand Up @@ -858,17 +866,17 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_fold_left_fn;
using __call _CCCL_NODEBUG_ALIAS = _CCCL_PP_REPEAT(_N, _M1) _State _CCCL_PP_REPEAT(_N, _M3); \
};

_CCCL_PP_REPEAT_REVERSE(17, _LIBCUDACXX_TYPE_LIST_FOLD_RIGHT)
_CCCL_PP_REPEAT_REVERSE(_CCCL_PP_INC(_CCCL_META_UNROLL_LIMIT), _LIBCUDACXX_TYPE_LIST_FOLD_RIGHT)

template <size_t _Np>
struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_fold_right_fn
{
template <class _Fn, class _State _CCCL_PP_REPEAT(16, _M0), class... _Rest>
using __call _CCCL_NODEBUG_ALIAS =
__type_call_indirect<__type_fold_right_fn<_Np - 16>,
_Fn,
__type_call<__type_fold_right_fn<16>, _Fn, _State _CCCL_PP_REPEAT(16, _M2)>,
_Rest...>;
template <class _Fn, class _State _CCCL_PP_REPEAT(_CCCL_META_UNROLL_LIMIT, _M0), class... _Rest>
using __call _CCCL_NODEBUG_ALIAS = __type_call_indirect<
__type_fold_right_fn<_Np - _CCCL_META_UNROLL_LIMIT>,
_Fn,
__type_call<__type_fold_right_fn<_CCCL_META_UNROLL_LIMIT>, _Fn, _State _CCCL_PP_REPEAT(_CCCL_META_UNROLL_LIMIT, _M2)>,
_Rest...>;
};

template <class _Init, class _Fn>
Expand All @@ -886,17 +894,17 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_fold_right_select_fn
using __call _CCCL_NODEBUG_ALIAS = _CCCL_PP_REPEAT(_N, _M1) _State _CCCL_PP_REPEAT(_N, _M4, _N, _CCCL_PP_DEC); \
};

_CCCL_PP_REPEAT_REVERSE(17, _LIBCUDACXX_TYPE_FOLD_LEFT)
_CCCL_PP_REPEAT_REVERSE(_CCCL_PP_INC(_CCCL_META_UNROLL_LIMIT), _LIBCUDACXX_TYPE_FOLD_LEFT)

template <size_t _Np>
struct _CCCL_TYPE_VISIBILITY_DEFAULT __type_fold_left_fn
{
template <class _Fn, class _State _CCCL_PP_REPEAT(16, _M0), class... _Rest>
template <class _Fn, class _State _CCCL_PP_REPEAT(_CCCL_META_UNROLL_LIMIT, _M0), class... _Rest>
using __call _CCCL_NODEBUG_ALIAS =
__type_call<__type_fold_left_fn<16>,
__type_call<__type_fold_left_fn<_CCCL_META_UNROLL_LIMIT>,
_Fn,
__type_call_indirect<__type_fold_left_fn<_Np - 16>, _Fn, _State, _Rest...> //
_CCCL_PP_REPEAT(16, _M2, 0, _CCCL_PP_INC)>;
__type_call_indirect<__type_fold_left_fn<_Np - _CCCL_META_UNROLL_LIMIT>, _Fn, _State, _Rest...> //
_CCCL_PP_REPEAT(_CCCL_META_UNROLL_LIMIT, _M2, 0, _CCCL_PP_INC)>;
};

template <class _Init, class _Fn>
Expand Down

0 comments on commit a2aa971

Please sign in to comment.