From 185d7ef71bd0485ecdcbf98d27c338a1b7b1ee05 Mon Sep 17 00:00:00 2001 From: Auburn Date: Tue, 26 Mar 2024 19:55:59 +0000 Subject: [PATCH] Replaced FMA feature flag for relaxed which also ties in the other platform precision variation functions like invsqrt and reciprical. Fixed WASM arch detect --- CMakeLists.txt | 4 ++ dispatch/cmake/ClassSIMD.cmake | 56 ++++++++++++------- dispatch/impl/DispatchClassImpl.h | 8 +-- examples/header_only/main.cpp | 2 +- include/FastSIMD/ToolSet.h | 6 ++ include/FastSIMD/ToolSet/Generic/Functions.h | 18 +++++- include/FastSIMD/ToolSet/Generic/Register.h | 3 + .../FastSIMD/ToolSet/Generic/Scalar/f32x1.h | 4 +- include/FastSIMD/ToolSet/WASM/128/f32x4.h | 47 ++++++++-------- include/FastSIMD/ToolSet/WASM/WASM.h | 9 ++- include/FastSIMD/ToolSet/x86/128/f32x4.h | 10 +++- include/FastSIMD/ToolSet/x86/256/f32x8.h | 18 ++++-- include/FastSIMD/ToolSet/x86/512/f32x16.h | 18 ++++-- include/FastSIMD/Utility/ArchDetect.h | 24 ++++---- include/FastSIMD/Utility/FeatureEnums.h | 9 +-- src/FastSIMD.cpp | 39 ++++--------- tests/CMakeLists.txt | 5 +- tests/test.cpp | 30 ++++++++-- tests/test.h | 5 +- tests/test.inl | 24 ++++---- 20 files changed, 202 insertions(+), 137 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 23ccbc0..cbc0713 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,10 @@ else() target_compile_definitions(FastSIMD PUBLIC FASTSIMD_STATIC_LIB) endif() +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + target_compile_options(FastSIMD PUBLIC "-msimd128") +endif() + target_include_directories(FastSIMD PUBLIC $ $ diff --git a/dispatch/cmake/ClassSIMD.cmake b/dispatch/cmake/ClassSIMD.cmake index a0ed2c8..1a37d1d 100644 --- a/dispatch/cmake/ClassSIMD.cmake +++ b/dispatch/cmake/ClassSIMD.cmake @@ -1,5 +1,5 @@ -function(fastsimd_add_feature_set_source simd_inl feature_set) +function(fastsimd_add_feature_set_source simd_inl feature_set is_relaxed) set(feature_set_source "${simd_library_source_dir}/${simd_library_name}_${feature_set}.cpp") set(simd_inl_full "${CMAKE_CURRENT_LIST_DIR}/${simd_inl}") @@ -9,44 +9,56 @@ function(fastsimd_add_feature_set_source simd_inl feature_set) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") # MSVC 32bit needs SSE2 flag for all SSE levels if(${feature_set} MATCHES "SSE[^(0-9)]" AND CMAKE_SIZEOF_VOID_P EQUAL 4) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "/arch:SSE2") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:SSE2) elseif(${feature_set} MATCHES "AVX[^(0-9)]") - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "/arch:AVX") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX) elseif(${feature_set} MATCHES AVX2) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX2) elseif(${feature_set} MATCHES AVX512) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "/arch:AVX512") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX512) endif() else() if(${feature_set} MATCHES SSE2 AND CMAKE_SIZEOF_VOID_P EQUAL 4) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-msse2") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse2) elseif(${feature_set} MATCHES SSE3) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-msse3") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse3) elseif(${feature_set} MATCHES SSSE3) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-mssse3") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mssse3) elseif(${feature_set} MATCHES SSE41) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-msse4.1") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse4.1) elseif(${feature_set} MATCHES SSE42) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-msse4.2") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse4.2) elseif(${feature_set} MATCHES "AVX[^(0-9)]") - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-mavx") + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx) elseif(${feature_set} MATCHES AVX2) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") + if(is_relaxed) + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mfma) + else() + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mno-fma) + endif() + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx2) elseif(${feature_set} MATCHES AVX512) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma") + if(is_relaxed) + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mfma) + else() + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mno-fma) + endif() + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx512f -mavx512dq -mavx512vl -mavx512bw) elseif(${feature_set} MATCHES WASM) - set_source_files_properties(${feature_set_source} PROPERTIES COMPILE_FLAGS "-msimd128 -mrelaxed-simd") + if(is_relaxed) + set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mrelaxed-simd) + endif() endif() endif() @@ -54,7 +66,7 @@ endfunction() function(fastsimd_create_dispatch_library simd_library_name) - cmake_parse_arguments(PARSE_ARGV 0 fastsimd_create_dispatch_library "" "" "SOURCES;FEATURE_SETS") + cmake_parse_arguments(PARSE_ARGV 0 fastsimd_create_dispatch_library "RELAXED" "" "SOURCES;FEATURE_SETS") list(LENGTH fastsimd_create_dispatch_library_FEATURE_SETS FEATURE_SET_COUNT) list(LENGTH fastsimd_create_dispatch_library_SOURCES SOURCES_COUNT) @@ -68,10 +80,10 @@ function(fastsimd_create_dispatch_library simd_library_name) set(fastsimd_create_dispatch_library_FEATURE_SETS SSE2 SSE41 - AVX2_FMA - AVX512_FMA - NEON_FMA - AARCH64_FMA + AVX2 + AVX512 + NEON + AARCH64 WASM) endif() @@ -93,6 +105,10 @@ function(fastsimd_create_dispatch_library simd_library_name) if(CMAKE_COMPILER_IS_GNUCC) set_target_properties(${simd_library_name} PROPERTIES COMPILE_FLAGS "-Wno-ignored-attributes") endif() + + if(fastsimd_create_dispatch_library_RELAXED) + target_compile_definitions(${simd_library_name} PUBLIC FASTSIMD_IS_RELAXED=1) + endif() set(feature_set_list "") set(feature_set_list_debug "") @@ -112,7 +128,7 @@ function(fastsimd_create_dispatch_library simd_library_name) if ("${COMPILE_OUTPUT}" MATCHES "TEST_FEATURE_SET_ACTIVE_SUCCESS") list(APPEND feature_set_list "FastSIMD::FeatureSet::${feature_set}") list(APPEND feature_set_list_debug "${feature_set}") - fastsimd_add_feature_set_source(${simd_inl} ${feature_set}) + fastsimd_add_feature_set_source(${simd_inl} ${feature_set} ${fastsimd_create_dispatch_library_RELAXED}) endif() endforeach() endforeach() diff --git a/dispatch/impl/DispatchClassImpl.h b/dispatch/impl/DispatchClassImpl.h index b83e1c2..15539b9 100644 --- a/dispatch/impl/DispatchClassImpl.h +++ b/dispatch/impl/DispatchClassImpl.h @@ -58,8 +58,8 @@ namespace FastSIMD }; - template - FS_FORCEINLINE static T* DispatchClassFactoryIterator( FastSIMD::FeatureSet maxFeatureSet, MemoryAllocator allocator ) + template + FS_FORCEINLINE static T* DispatchClassFactoryIterator( FeatureSet maxFeatureSet, MemoryAllocator allocator ) { if( maxFeatureSet < SIMD ) { @@ -68,7 +68,7 @@ namespace FastSIMD constexpr auto NextCompiled = FastSIMD::FASTSIMD_LIBRARY_NAME::CompiledFeatureSets::NextAfter; - if constexpr( NextCompiled != FastSIMD::FeatureSet::Max ) + if constexpr( NextCompiled != FeatureSet::Max ) { if( maxFeatureSet >= NextCompiled ) { @@ -80,7 +80,7 @@ namespace FastSIMD } template - FASTSIMD_API T* NewDispatchClass( FastSIMD::FeatureSet maxFeatureSet, MemoryAllocator allocator ) + FASTSIMD_API T* NewDispatchClass( FeatureSet maxFeatureSet, MemoryAllocator allocator ) { if( maxFeatureSet == FeatureSet::Max ) { diff --git a/examples/header_only/main.cpp b/examples/header_only/main.cpp index 404d9a0..1ef5487 100644 --- a/examples/header_only/main.cpp +++ b/examples/header_only/main.cpp @@ -10,5 +10,5 @@ int main() auto out = FS::Masked( invSqrt != FS::f32<4>( INFINITY ), dist ); - return 0; + return FS::Extract0( FS::Convert( out ) ); } diff --git a/include/FastSIMD/ToolSet.h b/include/FastSIMD/ToolSet.h index 6bd2aea..67ac66e 100644 --- a/include/FastSIMD/ToolSet.h +++ b/include/FastSIMD/ToolSet.h @@ -12,6 +12,12 @@ namespace FastSIMD { return SIMD; } + + template + static constexpr bool IsRelaxed() + { + return RELAXED; + } } // namespace FastSIMD #include "ToolSet/Generic/Functions.h" diff --git a/include/FastSIMD/ToolSet/Generic/Functions.h b/include/FastSIMD/ToolSet/Generic/Functions.h index f3ea0e4..369c621 100644 --- a/include/FastSIMD/ToolSet/Generic/Functions.h +++ b/include/FastSIMD/ToolSet/Generic/Functions.h @@ -464,8 +464,22 @@ namespace FS template FS_FORCEINLINE Register InvSqrt( const Register& a ) { - static_assert( !IsNativeV>, "FastSIMD: FS::InvSqrt not supported with provided types" ); - return Register{ InvSqrt( a.v0 ), InvSqrt( a.v1 ) }; + if constexpr( IsNativeV> ) + { + return Register( 1 ) / Sqrt( a ); + } + else + { + return Register{ InvSqrt( a.v0 ), InvSqrt( a.v1 ) }; + } + } + + // Sqrt + template + FS_FORCEINLINE Register Sqrt( const Register& a ) + { + static_assert( !IsNativeV>, "FastSIMD: FS::Sqrt not supported with provided types" ); + return Register{ Sqrt( a.v0 ), Sqrt( a.v1 ) }; } template diff --git a/include/FastSIMD/ToolSet/Generic/Register.h b/include/FastSIMD/ToolSet/Generic/Register.h index c750780..c81033a 100644 --- a/include/FastSIMD/ToolSet/Generic/Register.h +++ b/include/FastSIMD/ToolSet/Generic/Register.h @@ -287,6 +287,9 @@ namespace FS template constexpr bool IsNativeV = IsNative::value; + template + using EnableIfRelaxed = std::enable_if_t()>; + template using i32 = Register; diff --git a/include/FastSIMD/ToolSet/Generic/Scalar/f32x1.h b/include/FastSIMD/ToolSet/Generic/Scalar/f32x1.h index 692115d..221ee7f 100644 --- a/include/FastSIMD/ToolSet/Generic/Scalar/f32x1.h +++ b/include/FastSIMD/ToolSet/Generic/Scalar/f32x1.h @@ -123,8 +123,8 @@ namespace FS template>> - FS_FORCEINLINE f32<1, SIMD> InvSqrt( const f32<1, SIMD>& a ) + FS_FORCEINLINE f32<1, SIMD> Sqrt( const f32<1, SIMD>& a ) { - return 1 / std::sqrt( a.native.f ); + return std::sqrt( a.native.f ); } } diff --git a/include/FastSIMD/ToolSet/WASM/128/f32x4.h b/include/FastSIMD/ToolSet/WASM/128/f32x4.h index 2a86203..ff2ccd4 100644 --- a/include/FastSIMD/ToolSet/WASM/128/f32x4.h +++ b/include/FastSIMD/ToolSet/WASM/128/f32x4.h @@ -156,13 +156,27 @@ namespace FS template>> FS_FORCEINLINE f32<4, SIMD> Min( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) { - return wasm_f32x4_pmin( a.native, b.native ); + if constexpr( FastSIMD::IsRelaxed() ) + { + return wasm_f32x4_relaxed_min( a.native, b.native ); + } + else + { + return wasm_f32x4_min( a.native, b.native ); + } } template>> FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) { - return wasm_f32x4_pmax( a.native, b.native ); + if constexpr( FastSIMD::IsRelaxed() ) + { + return wasm_f32x4_relaxed_max( a.native, b.native ); + } + else + { + return wasm_f32x4_max( a.native, b.native ); + } } template>> @@ -182,50 +196,33 @@ namespace FS FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) { return wasm_f32x4_sub( a.native, - wasm_f32x4_convert_i32x4(static_cast(mask.native)) ); + wasm_f32x4_convert_i32x4( static_cast( mask.native ) ) ); } template>> FS_FORCEINLINE f32<4, SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) { return wasm_f32x4_add( a.native, - wasm_f32x4_convert_i32x4(static_cast(mask.native)) ); + wasm_f32x4_convert_i32x4( static_cast( mask.native ) ) ); } template>> - FS_FORCEINLINE f32<4, SIMD> Reciprocal( const f32<4, SIMD>& a ) + FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) { - return wasm_f32x4_div( f32<4, SIMD>{1.0f}.native, a.native ); + return wasm_f32x4_sqrt( a.native ); } - template>> - FS_FORCEINLINE f32<4, SIMD> InvSqrt( const f32<4, SIMD>& a ) - { - return wasm_f32x4_div( f32<4, SIMD>{1.0f}.native, wasm_f32x4_sqrt( a.native ) ); - } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<4, SIMD> FMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) { return wasm_f32x4_relaxed_madd( c.native, a.native, b.native ); } - template>, typename = std::enable_if_t> - FS_FORCEINLINE f32<4, SIMD> FMulSub( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) - { - return wasm_f32x4_relaxed_msub( c.native, a.native, b.native ); - } - - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<4, SIMD> FNMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) { return wasm_f32x4_relaxed_nmadd( c.native, a.native, b.native ); } - - template>, typename = std::enable_if_t> - FS_FORCEINLINE f32<4, SIMD> FNMulSub( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) - { - return wasm_f32x4_relaxed_nmsub( c.native, a.native, b.native ); - } } diff --git a/include/FastSIMD/ToolSet/WASM/WASM.h b/include/FastSIMD/ToolSet/WASM/WASM.h index b04ff42..7d200f9 100644 --- a/include/FastSIMD/ToolSet/WASM/WASM.h +++ b/include/FastSIMD/ToolSet/WASM/WASM.h @@ -13,7 +13,14 @@ namespace FS template>> FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) { - return wasm_i32x4_trunc_sat_f32x4( Round( a ).native ); + if constexpr( FastSIMD::IsRelaxed() ) + { + return wasm_i32x4_relaxed_trunc_f32x4( Round( a ).native ); + } + else + { + return wasm_i32x4_trunc_sat_f32x4( Round( a ).native ); + } } template>> diff --git a/include/FastSIMD/ToolSet/x86/128/f32x4.h b/include/FastSIMD/ToolSet/x86/128/f32x4.h index 086552b..46d7891 100644 --- a/include/FastSIMD/ToolSet/x86/128/f32x4.h +++ b/include/FastSIMD/ToolSet/x86/128/f32x4.h @@ -247,15 +247,21 @@ namespace FS } - template>> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<4, SIMD> Reciprocal( const f32<4, SIMD>& a ) { return _mm_rcp_ps( a.native ); } - template>> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<4, SIMD> InvSqrt( const f32<4, SIMD>& a ) { return _mm_rsqrt_ps( a.native ); } + + template>> + FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) + { + return _mm_sqrt_ps( a.native ); + } } diff --git a/include/FastSIMD/ToolSet/x86/256/f32x8.h b/include/FastSIMD/ToolSet/x86/256/f32x8.h index ca4a807..5d0a2e3 100644 --- a/include/FastSIMD/ToolSet/x86/256/f32x8.h +++ b/include/FastSIMD/ToolSet/x86/256/f32x8.h @@ -212,37 +212,43 @@ namespace FS } - template>> + template>, typename = EnableIfRelaxed> FS_FORCEINLINE f32<8, SIMD> Reciprocal( const f32<8, SIMD>& a ) { return _mm256_rcp_ps( a.native ); } - template>> + template>, typename = EnableIfRelaxed> FS_FORCEINLINE f32<8, SIMD> InvSqrt( const f32<8, SIMD>& a ) { return _mm256_rsqrt_ps( a.native ); } - template>, typename = std::enable_if_t> + template>> + FS_FORCEINLINE f32<8, SIMD> Sqrt( const f32<8, SIMD>& a ) + { + return _mm256_sqrt_ps( a.native ); + } + + template>, typename = EnableIfRelaxed, typename = std::enable_if_t> FS_FORCEINLINE f32<8, SIMD> FMulAdd( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) { return _mm256_fmadd_ps( a.native, b.native, c.native ); } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed, typename = std::enable_if_t> FS_FORCEINLINE f32<8, SIMD> FMulSub( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) { return _mm256_fmsub_ps( a.native, b.native, c.native ); } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed, typename = std::enable_if_t> FS_FORCEINLINE f32<8, SIMD> FNMulAdd( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) { return _mm256_fnmadd_ps( a.native, b.native, c.native ); } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed, typename = std::enable_if_t> FS_FORCEINLINE f32<8, SIMD> FNMulSub( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) { return _mm256_fnmsub_ps( a.native, b.native, c.native ); diff --git a/include/FastSIMD/ToolSet/x86/512/f32x16.h b/include/FastSIMD/ToolSet/x86/512/f32x16.h index ae1d043..82fa27c 100644 --- a/include/FastSIMD/ToolSet/x86/512/f32x16.h +++ b/include/FastSIMD/ToolSet/x86/512/f32x16.h @@ -227,37 +227,43 @@ namespace FS return _mm512_mask_mul_ps( a.native, ~mask.native, a.native, b.native ); } - template>> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<16, SIMD> Reciprocal( const f32<16, SIMD>& a ) { return _mm512_rcp14_ps( a.native ); } - template>> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<16, SIMD> InvSqrt( const f32<16, SIMD>& a ) { return _mm512_rsqrt14_ps( a.native ); } + + template>> + FS_FORCEINLINE f32<16, SIMD> Sqrt( const f32<16, SIMD>& a ) + { + return _mm512_sqrt_ps( a.native ); + } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<16, SIMD> FMulAdd( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) { return _mm512_fmadd_ps( a.native, b.native, c.native ); } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<16, SIMD> FMulSub( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) { return _mm512_fmsub_ps( a.native, b.native, c.native ); } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<16, SIMD> FNMulAdd( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) { return _mm512_fnmadd_ps( a.native, b.native, c.native ); } - template>, typename = std::enable_if_t> + template>, typename = EnableIfRelaxed()> FS_FORCEINLINE f32<16, SIMD> FNMulSub( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) { return _mm512_fnmsub_ps( a.native, b.native, c.native ); diff --git a/include/FastSIMD/Utility/ArchDetect.h b/include/FastSIMD/Utility/ArchDetect.h index e3e9545..f95eebe 100644 --- a/include/FastSIMD/Utility/ArchDetect.h +++ b/include/FastSIMD/Utility/ArchDetect.h @@ -8,19 +8,18 @@ #define FASTSIMD_FEATURE_VALUE_SCALAR() 1 // -- Web Assembly -- -#if defined( __EMSCRIPTEN__ ) || defined(EMSCRIPTEN) -#define FASTSIMD_ARCH_WASM() 1 -#define FASTSIMD_ARCH_DETECT() WASM -#define FASTSIMD_FEATURE_DETECT() FASTSIMD_ARCH_WASM() +#if defined( __EMSCRIPTEN__ ) || defined( EMSCRIPTEN ) + #define FASTSIMD_FEATURE_VALUE_WASM() 2 +#define FASTSIMD_FEATURE_DETECT() WASM +#define FASTSIMD_ARCH_DETECT() WASM + // -- ARM -- #elif defined( __arm__ ) || defined( __TARGET_ARCH_ARM ) || defined( _M_ARM ) || defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __ARM64__ ) #define FASTSIMD_FEATURE_VALUE_NEON() 2 -#define FASTSIMD_FEATURE_VALUE_NEON_FMA() 3 -#define FASTSIMD_FEATURE_VALUE_AARCH64() 4 -#define FASTSIMD_FEATURE_VALUE_AARCH64_FMA() 5 +#define FASTSIMD_FEATURE_VALUE_AARCH64() 3 #if defined( __ARM64_ARCH_8__ ) || defined( __aarch64__ ) || defined( __ARMv8__ ) || defined( __ARMv8_A__ ) || defined( _M_ARM64 ) || defined( __ARM_NEON__ ) #define FASTSIMD_FEATURE_DETECT() AARCH64 @@ -46,11 +45,8 @@ #define FASTSIMD_FEATURE_VALUE_SSE41() 6 #define FASTSIMD_FEATURE_VALUE_SSE42() 7 #define FASTSIMD_FEATURE_VALUE_AVX() 8 -#define FASTSIMD_FEATURE_VALUE_AVX_FMA() 9 -#define FASTSIMD_FEATURE_VALUE_AVX2() 10 -#define FASTSIMD_FEATURE_VALUE_AVX2_FMA() 11 -#define FASTSIMD_FEATURE_VALUE_AVX512() 12 -#define FASTSIMD_FEATURE_VALUE_AVX512_FMA() 13 +#define FASTSIMD_FEATURE_VALUE_AVX2() 9 +#define FASTSIMD_FEATURE_VALUE_AVX512() 10 #if defined( __AVX512F__ ) && defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ ) #define FASTSIMD_FEATURE_DETECT() AVX512 @@ -97,3 +93,7 @@ #define FASTSIMD_DEFAULT_FEATURE_VALUE() FASTSIMD_FEATURE_VALUE( FASTSIMD_DEFAULT_FEATURE_SET ) #define FASTSIMD_MAX_FEATURE_VALUE() FASTSIMD_FEATURE_VALUE( FASTSIMD_MAX_FEATURE_SET ) #define FASTSIMD_ARCH_NAME() FASTSIMD_ARCH_DETECT()=FASTSIMD_MAX_FEATURE_SET + +#ifndef FASTSIMD_IS_RELAXED +#define FASTSIMD_IS_RELAXED 0 +#endif diff --git a/include/FastSIMD/Utility/FeatureEnums.h b/include/FastSIMD/Utility/FeatureEnums.h index 978f704..07c0a10 100644 --- a/include/FastSIMD/Utility/FeatureEnums.h +++ b/include/FastSIMD/Utility/FeatureEnums.h @@ -7,8 +7,6 @@ namespace FastSIMD { enum class FeatureFlag { - FMA, - Scalar, x86, @@ -29,7 +27,7 @@ namespace FastSIMD NEON, AARCH64, - WASM + WASM, }; constexpr std::uint32_t operator |( FeatureFlag a, FeatureFlag b ) @@ -55,16 +53,11 @@ namespace FastSIMD SSE41 = SSSE3 | FeatureFlag::SSE41, SSE42 = SSE41 | FeatureFlag::SSE42, AVX = SSE42 | FeatureFlag::AVX, - AVX_FMA = AVX | FeatureFlag::FMA, AVX2 = AVX | FeatureFlag::AVX2, - AVX2_FMA = AVX2 | FeatureFlag::FMA, AVX512 = AVX2 | FeatureFlag::AVX512_F | FeatureFlag::AVX512_VL | FeatureFlag::AVX512_DQ | FeatureFlag::AVX512_BW, - AVX512_FMA = AVX512 | FeatureFlag::FMA, NEON = FeatureFlag::ARM | FeatureFlag::NEON, - NEON_FMA = NEON | FeatureFlag::FMA, AARCH64 = NEON | FeatureFlag::AARCH64, - AARCH64_FMA = AARCH64 | FeatureFlag::FMA, WASM = Invalid | FeatureFlag::WASM, diff --git a/src/FastSIMD.cpp b/src/FastSIMD.cpp index bef8cf0..f029791 100644 --- a/src/FastSIMD.cpp +++ b/src/FastSIMD.cpp @@ -1,12 +1,6 @@ -#include -#include +#include #if FASTSIMD_CURRENT_ARCH_IS( X86 ) -#ifdef __GNUG__ -#include -#else -#include -#endif // Define interface to cpuid instruction. // input: eax = functionnumber, ecx = 0 @@ -84,7 +78,7 @@ namespace FastSIMD #if FASTSIMD_CURRENT_ARCH_IS( X86 ) static std::uint32_t DetectCpuSupportedFlags() { - std::uint32_t supportedFlags = FastSIMD::FeatureFlag::x86 | FastSIMD::FeatureFlag::Scalar; + std::uint32_t supportedFlags = FeatureFlag::x86 | FeatureFlag::Scalar; //#if FASTSIMD_x86 int abcd[4] = { 0, 0, 0, 0 }; // cpuid results @@ -140,10 +134,6 @@ namespace FastSIMD supportedFlags = supportedFlags | FeatureFlag::SSE42; // SSE4.2 supported - if( ( abcd[2] >> 12 & 1 ) == 1 ) - supportedFlags = supportedFlags | FeatureFlag::FMA; - // FMA3 supported - if( ( abcd[2] >> 26 & 1 ) == 0 ) return supportedFlags; // no XSAVE if( ( abcd[2] >> 27 & 1 ) == 0 ) @@ -157,6 +147,12 @@ namespace FastSIMD supportedFlags = supportedFlags | FeatureFlag::AVX; // AVX supported + if constexpr( IsRelaxed() ) + { + if( ( abcd[2] >> 12 & 1 ) == 0 ) + return supportedFlags; // no FMA3 + } + cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags if( ( abcd[1] >> 5 & 1 ) == 0 ) return supportedFlags; // no AVX2 @@ -197,8 +193,7 @@ namespace FastSIMD FastSIMD::FeatureFlag::ARM | FastSIMD::FeatureFlag::Scalar | FastSIMD::FeatureFlag::NEON | - FastSIMD::FeatureFlag::AARCH64 | - FastSIMD::FeatureFlag::FMA; + FastSIMD::FeatureFlag::AARCH64; return supportedFlags; } @@ -226,32 +221,27 @@ namespace FastSIMD FeatureSet::SSE41, FeatureSet::SSE42, FeatureSet::AVX, - FeatureSet::AVX_FMA, FeatureSet::AVX2, - FeatureSet::AVX2_FMA, FeatureSet::AVX512, - FeatureSet::AVX512_FMA, #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) FeatureSet::NEON, - FeatureSet::NEON_FMA, FeatureSet::AARCH64, - FeatureSet::AARCH64_FMA, #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) FeatureSet::WASM, #endif }; - FASTSIMD_API FastSIMD::FeatureSet DetectCpuMaxFeatureSet() + FASTSIMD_API FeatureSet DetectCpuMaxFeatureSet() { - static FastSIMD::FeatureSet cache = [] + static FeatureSet cache = [] { std::uint32_t supportedFlags = DetectCpuSupportedFlags(); FeatureSet maxSupported = FeatureSet::Invalid; - for( FeatureSet featureSet: FeatureSetValues ) + for( FeatureSet featureSet : FeatureSetValues ) { // Check if feature set contains unsupported flags if( ( static_cast( featureSet ) ^ supportedFlags ) & ~supportedFlags ) @@ -281,15 +271,10 @@ namespace FastSIMD case FeatureSet::SSE41: return "SSE4.1"; case FeatureSet::SSE42: return "SSE4.2"; case FeatureSet::AVX: return "AVX"; - case FeatureSet::AVX_FMA: return "AVX_FMA"; case FeatureSet::AVX2: return "AVX2"; - case FeatureSet::AVX2_FMA: return "AVX2_FMA"; case FeatureSet::AVX512: return "AVX512"; - case FeatureSet::AVX512_FMA: return "AVX512_FMA"; case FeatureSet::NEON: return "NEON"; - case FeatureSet::NEON_FMA: return "NEON_FMA"; case FeatureSet::AARCH64: return "AARCH64"; - case FeatureSet::AARCH64_FMA: return "AARCH64_FMA"; case FeatureSet::WASM: return "WASM"; case FeatureSet::Max: return "Max"; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 35a77cc..a0b0c74 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,8 +1,9 @@ -fastsimd_create_dispatch_library(test_simd SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2_FMA AVX512_FMA NEON_FMA AARCH64_FMA WASM) +fastsimd_create_dispatch_library(test_simd SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2 AVX512 NEON AARCH64 WASM) +fastsimd_create_dispatch_library(test_simd_relaxed RELAXED SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2 AVX512 NEON AARCH64 WASM) add_executable(test "test.cpp") -target_link_libraries(test PRIVATE FastSIMD test_simd) +target_link_libraries(test PRIVATE FastSIMD test_simd test_simd_relaxed) if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(CMAKE_EXECUTABLE_SUFFIX ".html") diff --git a/tests/test.cpp b/tests/test.cpp index 2b8c127..946a12c 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -75,12 +76,20 @@ struct TestRunner if( HEAD <= FastSIMD::DetectCpuMaxFeatureSet() ) { std::cout << "Generating Tests: " << FastSIMD::GetFeatureSetString( HEAD ) << std::endl; + { + std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); - std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); + TestCollection simdCollection = testSimd->RegisterTests(); - TestCollection simdCollection = testSimd->RegisterTests(); + collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); + } + { + std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); - collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); + TestCollection simdCollection = testSimd->RegisterTests(); + + collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); + } } return collections; @@ -233,13 +242,22 @@ struct TestRunner std::cerr << "Tests do not match: " << testName; throw std::exception(); } - else if( test.featureSet == FastSIMD::FeatureSet::SCALAR ) + if( test.featureSet == FastSIMD::FeatureSet::SCALAR && !test.relaxed ) { std::cerr << "Multiple tests with same name: " << testName; throw std::exception(); } - if( !CompareOutputs( testName, test.featureSet, test.returnType, test.accuracy, outputCount, scalarResults, simdResults ) ) + std::string testNameRelaxed = testName.data(); + float accuracy = 0; + + if( test.relaxed ) + { + testNameRelaxed += " RELAXED"; + accuracy = test.relaxedAccuracy; + } + + if( !CompareOutputs( testNameRelaxed, test.featureSet, test.returnType, accuracy, outputCount, scalarResults, simdResults ) ) { std::cerr << "Inputs: " << tests[0].inputsFunc( idx, rndInts, rndFloats ) << std::endl; failed = true; @@ -259,7 +277,7 @@ struct TestRunner static void Run() { - std::cout << "Starting Tests Register Size: " << RegisterBytes * 8 << " (" << RegisterBytes << "b)" << std::endl; + std::cout << "Starting Tests - Register Size: " << RegisterBytes * 8 << " (" << RegisterBytes << "b)" << std::endl; TestSet testSet = TestOrganiser::GetSet(); diff --git a/tests/test.h b/tests/test.h index 53fbf8a..223bf08 100644 --- a/tests/test.h +++ b/tests/test.h @@ -24,15 +24,16 @@ struct TestData }; FastSIMD::FeatureSet featureSet; + bool relaxed; ReturnType returnType; - float accuracy = 0; + float relaxedAccuracy = 0; std::function testFunc; std::function inputsFunc; }; using TestCollection = std::vector>; -template +template class TestFastSIMD { public: diff --git a/tests/test.inl b/tests/test.inl index 65842ab..fac0ffc 100644 --- a/tests/test.inl +++ b/tests/test.inl @@ -76,8 +76,8 @@ template static constexpr size_t GetReturnCount> = N; -template -class FastSIMD::DispatchClass, SIMD> : public TestFastSIMD +template +class FastSIMD::DispatchClass, SIMD> : public TestFastSIMD { template using TestReg = FS::Register; @@ -263,6 +263,7 @@ class FastSIMD::DispatchClass, SIMD> : public TestFa { TestData data; data.featureSet = SIMD; + data.relaxed = FastSIMD::IsRelaxed(); data.returnType = GetReturn::ReturnType>::Type; data.testFunc = TestFunctionFactory::Create( func ); @@ -350,7 +351,7 @@ class FastSIMD::DispatchClass, SIMD> : public TestFa RegisterTest( tests, "f32 plus operator", std::plus() ); RegisterTest( tests, "f32 minus operator", std::minus() ); RegisterTest( tests, "f32 multiply operator", std::multiplies() ); - RegisterTest( tests, "f32 divide operator", std::divides() ).accuracy = 64; + RegisterTest( tests, "f32 divide operator", std::divides() ).relaxedAccuracy = 64; RegisterTest( tests, "f32 fused multiply add", []( TestRegf32 a, TestRegf32 b ) { return FS::FMulAdd( a, TestRegf32( -1 ), b ); } ); RegisterTest( tests, "f32 fused multiply sub", []( TestRegf32 a, TestRegf32 b ) { return FS::FMulSub( a, TestRegf32( -1 ), b ); } ); @@ -394,17 +395,18 @@ class FastSIMD::DispatchClass, SIMD> : public TestFa RegisterTest( tests, "f32 ceil", []( TestRegf32 a ) { return FS::Ceil( a ); } ); RegisterTest( tests, "f32 floor", []( TestRegf32 a ) { return FS::Floor( a ); } ); - RegisterTest( tests, "f32 inv sqrt", []( TestRegf32 a ) { return FS::InvSqrt( FS::Min( FS::Max( FS::Abs( a ), TestRegf32( 1.e-16f ) ), TestRegf32( 1.e+16f ) ) ); } ).accuracy = 8192; + RegisterTest( tests, "f32 sqrt", []( TestRegf32 a ) { return FS::Sqrt( FS::Min( FS::Max( FS::Abs( a ), TestRegf32( 1.e-16f ) ), TestRegf32( 1.e+16f ) ) ); } ); + RegisterTest( tests, "f32 inv sqrt", []( TestRegf32 a ) { return FS::InvSqrt( FS::Min( FS::Max( FS::Abs( a ), TestRegf32( 1.e-16f ) ), TestRegf32( 1.e+16f ) ) ); } ).relaxedAccuracy = 8192; RegisterTest( tests, "f32 reciprocal", []( TestRegf32 a ) { TestRegf32 clamped = FS::Min( FS::Max( FS::Abs( a ), TestRegf32( 1.e-16f ) ), TestRegf32( 1.e+16f ) ); return FS::Reciprocal( FS::Select( a > TestRegf32( 0 ), clamped, -clamped ) ); - } ).accuracy = 8192; + } ).relaxedAccuracy = 8192; - RegisterTest( tests, "f32 cos", []( TestRegf32 a ) { return FS::Cos( a ); } ).accuracy = 8192; - RegisterTest( tests, "f32 sin", []( TestRegf32 a ) { return FS::Sin( a ); } ).accuracy = 8192; - RegisterTest( tests, "f32 exp", []( TestRegf32 a ) { return FS::Exp( a ); } ).accuracy = 8192; - RegisterTest( tests, "f32 log", []( TestRegf32 a ) { return FS::Log( a ); } ).accuracy = 8192; - RegisterTest( tests, "f32 pow", []( TestRegf32 a, TestRegf32 b ) { return FS::Pow( a, b ); } ).accuracy = 8192; + RegisterTest( tests, "f32 cos", []( TestRegf32 a ) { return FS::Cos( a ); } ).relaxedAccuracy = 8192; + RegisterTest( tests, "f32 sin", []( TestRegf32 a ) { return FS::Sin( a ); } ).relaxedAccuracy = 8192; + RegisterTest( tests, "f32 exp", []( TestRegf32 a ) { return FS::Exp( a ); } ).relaxedAccuracy = 8192; + RegisterTest( tests, "f32 log", []( TestRegf32 a ) { return FS::Log( a ); } ).relaxedAccuracy = 8192; + RegisterTest( tests, "f32 pow", []( TestRegf32 a, TestRegf32 b ) { return FS::Pow( a, b ); } ).relaxedAccuracy = 8192; RegisterTest( tests, "i32 convert to f32", []( TestRegi32 a ) { return FS::Convert( a ); } ); RegisterTest( tests, "i32 cast to f32", []( TestRegi32 a ) { return FS::Cast( a ); } ); @@ -415,4 +417,4 @@ class FastSIMD::DispatchClass, SIMD> : public TestFa } }; -template class FastSIMD::RegisterDispatchClass>; +template class FastSIMD::RegisterDispatchClass>;