Skip to content

Commit

Permalink
Optimized the SSE2 path for the hard-thresholding filter
Browse files Browse the repository at this point in the history
Thanks to MonoS for the idea: MonoS@421fa78
  • Loading branch information
mawen1250 committed Jan 23, 2016
1 parent 1fc6137 commit 1263139
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 12 deletions.
9 changes: 9 additions & 0 deletions include/Helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ class ClockCounter
#endif


#if defined(__SSE2__)
// Lane-wise absolute value of four packed single-precision floats.
//
// Clears the sign bit (bit 31) of every 32-bit lane and leaves all other bits
// untouched, so -0.0f becomes +0.0f and the remaining bits of each lane
// (including those of NaN/Inf encodings) pass through unchanged.
//
// x: packed floats to process.
// Returns: x with the sign bit of each lane cleared.
inline __m128 _mm_abs_ps(const __m128 &x)
{
    // Deliberately a plain local instead of a function-local static: a static
    // with a runtime initializer typically needs a thread-safe initialization
    // guard that is tested on every call, whereas this constant can be
    // materialized and hoisted out of calling loops by the compiler.
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return _mm_and_ps(x, abs_mask);
}
#endif


////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Exception handle

Expand Down
9 changes: 3 additions & 6 deletions source/BM3D_Basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,15 @@ void BM3D_Basic_Process::CollaborativeFilter(int plane,
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;

static const __m128 zero_ps = _mm_setzero_ps();
__m128i cmp_sum = _mm_setzero_si128();

for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
{
const __m128 s1 = _mm_load_ps(srcp);
const __m128 t1p = _mm_load_ps(thrp);
const __m128 t1n = _mm_sub_ps(zero_ps, t1p);
const __m128 t1 = _mm_load_ps(thrp);

const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p);
const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
const __m128 cmp = _mm_or_ps(cmp1, cmp2);
const __m128 s1abs = _mm_abs_ps(s1);
const __m128 cmp = _mm_cmpgt_ps(s1abs, t1);

const __m128 d1 = _mm_and_ps(cmp, s1);
_mm_store_ps(srcp, d1);
Expand Down
9 changes: 3 additions & 6 deletions source/VBM3D_Basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,15 @@ void VBM3D_Basic_Process::CollaborativeFilter(int plane,
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;

static const __m128 zero_ps = _mm_setzero_ps();
__m128i cmp_sum = _mm_setzero_si128();

for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
{
const __m128 s1 = _mm_load_ps(srcp);
const __m128 t1p = _mm_load_ps(thrp);
const __m128 t1n = _mm_sub_ps(zero_ps, t1p);
const __m128 t1 = _mm_load_ps(thrp);

const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p);
const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
const __m128 cmp = _mm_or_ps(cmp1, cmp2);
const __m128 s1abs = _mm_abs_ps(s1);
const __m128 cmp = _mm_cmpgt_ps(s1abs, t1);

const __m128 d1 = _mm_and_ps(cmp, s1);
_mm_store_ps(srcp, d1);
Expand Down

0 comments on commit 1263139

Please sign in to comment.