From 126313969ee870e1ea41a6f60f04d2cc9243ae3c Mon Sep 17 00:00:00 2001 From: mawen1250 Date: Sat, 23 Jan 2016 09:35:01 +0800 Subject: [PATCH] Optimized the SSE2 path for hard-thresholding filter Thanks for the idea from MonoS: https://github.com/MonoS/VapourSynth-BM3D/commit/421fa782ee60b7a106597d2243fe90d42a247f90 --- include/Helper.h | 9 +++++++++ source/BM3D_Basic.cpp | 9 +++------ source/VBM3D_Basic.cpp | 9 +++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/Helper.h b/include/Helper.h index 9ec15c2..6d2b576 100644 --- a/include/Helper.h +++ b/include/Helper.h @@ -104,6 +104,15 @@ class ClockCounter #endif +#if defined(__SSE2__) +inline __m128 _mm_abs_ps(const __m128 &x) /* returns |x| for each of the four packed floats in x */ +{ + const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); /* every bit set except bit 31, the float sign bit; a plain local rather than a function-local static, so no static-initialization guard is evaluated on each call from the filter loop */ + return _mm_and_ps(x, mask); /* clearing the sign bit leaves the magnitude of each lane */ +} +#endif + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Exception handle diff --git a/source/BM3D_Basic.cpp b/source/BM3D_Basic.cpp index d3cd6d2..b0c067b 100644 --- a/source/BM3D_Basic.cpp +++ b/source/BM3D_Basic.cpp @@ -88,18 +88,15 @@ void BM3D_Basic_Process::CollaborativeFilter(int plane, const ptrdiff_t simd_residue = srcGroup.size() % simd_step; const ptrdiff_t simd_width = srcGroup.size() - simd_residue; - static const __m128 zero_ps = _mm_setzero_ps(); __m128i cmp_sum = _mm_setzero_si128(); for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step) { const __m128 s1 = _mm_load_ps(srcp); - const __m128 t1p = _mm_load_ps(thrp); - const __m128 t1n = _mm_sub_ps(zero_ps, t1p); + const __m128 t1 = _mm_load_ps(thrp); - const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p); - const __m128 cmp2 = _mm_cmplt_ps(s1, t1n); - const __m128 cmp = _mm_or_ps(cmp1, cmp2); + const __m128 s1abs = _mm_abs_ps(s1); + const __m128 cmp = _mm_cmpgt_ps(s1abs, t1); const __m128 d1 = _mm_and_ps(cmp, s1); _mm_store_ps(srcp, d1); diff --git a/source/VBM3D_Basic.cpp 
b/source/VBM3D_Basic.cpp index 8fdcd2c..fea5302 100644 --- a/source/VBM3D_Basic.cpp +++ b/source/VBM3D_Basic.cpp @@ -88,18 +88,15 @@ void VBM3D_Basic_Process::CollaborativeFilter(int plane, const ptrdiff_t simd_residue = srcGroup.size() % simd_step; const ptrdiff_t simd_width = srcGroup.size() - simd_residue; - static const __m128 zero_ps = _mm_setzero_ps(); __m128i cmp_sum = _mm_setzero_si128(); for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step) { const __m128 s1 = _mm_load_ps(srcp); - const __m128 t1p = _mm_load_ps(thrp); - const __m128 t1n = _mm_sub_ps(zero_ps, t1p); + const __m128 t1 = _mm_load_ps(thrp); - const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p); - const __m128 cmp2 = _mm_cmplt_ps(s1, t1n); - const __m128 cmp = _mm_or_ps(cmp1, cmp2); + const __m128 s1abs = _mm_abs_ps(s1); + const __m128 cmp = _mm_cmpgt_ps(s1abs, t1); const __m128 d1 = _mm_and_ps(cmp, s1); _mm_store_ps(srcp, d1);