NEON: properly implement _high intrinsics #1030

Open · wants to merge 2 commits into base: master
17 changes: 8 additions & 9 deletions simde/arm/neon/addl_high.h
@@ -28,9 +28,8 @@
#if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H)
#define SIMDE_ARM_NEON_ADDL_HIGH_H

#include "add.h"
#include "movl.h"
#include "movl_high.h"
#include "addl.h"
#include "get_high.h"
#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
@@ -43,7 +42,7 @@ simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s8(a, b);
#else
return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
return simde_vaddl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -57,7 +56,7 @@ simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s16(a, b);
#else
return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
return simde_vaddl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -71,7 +70,7 @@ simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_s32(a, b);
#else
return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
return simde_vaddl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +84,7 @@ simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u8(a, b);
#else
return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
return simde_vaddl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +98,7 @@ simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u16(a, b);
#else
return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
return simde_vaddl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +112,7 @@ simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddl_high_u32(a, b);
#else
return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
return simde_vaddl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
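The new fallback leans on the identity vaddl_high_s8(a, b) == vaddl_s8(vget_high_s8(a), vget_high_s8(b)): the widening add of the high halves is just the plain widening add applied to the upper 64 bits of each input. A minimal usage sketch of the s8 case, checked against scalar arithmetic, follows; the test harness and the include path are illustrative and assume the amalgamated <simde/arm/neon.h> header.

/* Sketch: check simde_vaddl_high_s8 against plain scalar math.
 * The include path assumes the amalgamated SIMDe header layout. */
#include <simde/arm/neon.h>
#include <stdio.h>

int main(void) {
  int8_t a[16], b[16];
  for (int i = 0; i < 16; i++) {
    a[i] = (int8_t)(i - 8);
    b[i] = (int8_t)(3 * i);
  }

  simde_int8x16_t va = simde_vld1q_s8(a);
  simde_int8x16_t vb = simde_vld1q_s8(b);

  /* Widening add of the high eight lanes: lane i of the result is
   * a[i + 8] + b[i + 8], computed as int16_t so it cannot wrap. */
  simde_int16x8_t sum = simde_vaddl_high_s8(va, vb);

  int16_t r[8];
  simde_vst1q_s16(r, sum);
  for (int i = 0; i < 8; i++) {
    int expect = (int)a[i + 8] + (int)b[i + 8];
    if (r[i] != expect) {
      printf("lane %d: got %d, want %d\n", i, (int)r[i], expect);
      return 1;
    }
  }
  puts("ok");
  return 0;
}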
82 changes: 8 additions & 74 deletions simde/arm/neon/addw_high.h
@@ -28,8 +28,8 @@
#define SIMDE_ARM_NEON_ADDW_HIGH_H

#include "types.h"
#include "movl_high.h"
#include "add.h"
#include "get_high.h"
#include "addw.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -40,19 +40,8 @@ simde_int16x8_t
simde_vaddw_high_s8(simde_int16x8_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_s8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_s16(a, simde_vmovl_high_s8(b));
#else
simde_int16x8_private r_;
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_int8x16_private b_ = simde_int8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_int16x8_from_private(r_);
Comment on lines -46 to -55

Collaborator:
Hmm.. so you think that there is no architecture/compiler combo that would produce better code from this vectorize loop than the fallback of simde_vaddw_s8(a, simde_vget_high_s8(b))?

Contributor Author:
I am mostly going for ease of implementation in this PR. If the compiler is reasonably intelligent, it would be able to detect the redundant assignment/shuffle and eliminate it. However, I haven't tested the codegen.

Contributor Author (@easyaspi314, May 31, 2023):
GCC and Clang both generate identical code on a downscaled version, eliding the copy.
MSVC x86 emits a few extra instructions on /arch:IA32 either way, whether I use a copy loop or memcpy, but it isn't terrible: https://godbolt.org/z/Y3v4vjz46
Here is /arch:SSE2: https://godbolt.org/z/nWTKMfh7K
However, 99% of the time MSVC will use SSE2 by default; /arch:IA32 is opt-in.
GCC and Clang are the ones where the scalar fallback matters, and they emit identical code.
Long story short: 99% free code reuse.

Contributor Author:
Hold up, the story changes with uint16_t... GCC vomits.

Collaborator:
With which version does GCC vomit when compiling the uint16_t functions: the vectorized or the downscaled version?

Contributor Author (@easyaspi314, Jun 1, 2023):
It actually seems to be the opposite problem. The autovectorized codegen is actually bad on vaddw_u16; GCC couldn't autovectorize the one-shot version.

Collaborator:
So you're seeing better code from this PR for GCC?

Contributor Author (@easyaspi314, Jun 4, 2023):
No. Rather, vaddw_u16 has mediocre codegen, and reusing it passes those codegen issues on to vaddw_high_u16. This is because GCC vectorizes it internally, which is better when SIMD is available.

Collaborator:
Okay. Is this PR ready, or do you want to make other changes?

return simde_vaddw_s8(a, simde_vget_high_s8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -65,19 +54,8 @@ simde_int32x4_t
simde_vaddw_high_s16(simde_int32x4_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_s16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_s32(a, simde_vmovl_high_s16(b));
#else
simde_int32x4_private r_;
simde_int32x4_private a_ = simde_int32x4_to_private(a);
simde_int16x8_private b_ = simde_int16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_int32x4_from_private(r_);
return simde_vaddw_s16(a, simde_vget_high_s16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -90,19 +68,8 @@ simde_int64x2_t
simde_vaddw_high_s32(simde_int64x2_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_s32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_s64(a, simde_vmovl_high_s32(b));
#else
simde_int64x2_private r_;
simde_int64x2_private a_ = simde_int64x2_to_private(a);
simde_int32x4_private b_ = simde_int32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_int64x2_from_private(r_);
return simde_vaddw_s32(a, simde_vget_high_s32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -115,19 +82,8 @@ simde_uint16x8_t
simde_vaddw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_u8(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_u16(a, simde_vmovl_high_u8(b));
#else
simde_uint16x8_private r_;
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_uint8x16_private b_ = simde_uint8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_uint16x8_from_private(r_);
return simde_vaddw_u8(a, simde_vget_high_u8(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -140,19 +96,8 @@ simde_uint32x4_t
simde_vaddw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_u16(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_u32(a, simde_vmovl_high_u16(b));
#else
simde_uint32x4_private r_;
simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
simde_uint16x8_private b_ = simde_uint16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_uint32x4_from_private(r_);
return simde_vaddw_u16(a, simde_vget_high_u16(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -165,19 +110,8 @@ simde_uint64x2_t
simde_vaddw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddw_high_u32(a, b);
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return simde_vaddq_u64(a, simde_vmovl_high_u32(b));
#else
simde_uint64x2_private r_;
simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
simde_uint32x4_private b_ = simde_uint32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
}

return simde_uint64x2_from_private(r_);
return simde_vaddw_u32(a, simde_vget_high_u32(b));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
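The review thread above compares two fallback shapes for vaddw_high_s8: the removed SIMDE_VECTORIZE loop, which indexes directly into the high half of b, and the new composition vaddw_s8(a, vget_high_s8(b)), which goes through an explicit high-half extraction that the compiler is expected to elide. A simplified, SIMDe-free model of the two shapes is sketched below; the stand-in types and function names are hypothetical, and this is not the exact Compiler Explorer input from the thread.

#include <stdint.h>
#include <string.h>

/* Stand-in container types for the 128- and 64-bit NEON vectors. */
typedef struct { int16_t values[8];  } i16x8;
typedef struct { int8_t  values[16]; } i8x16;
typedef struct { int8_t  values[8];  } i8x8;

/* Old shape: a single loop indexing straight into the high half of b. */
i16x8 addw_high_loop(i16x8 a, i8x16 b) {
  i16x8 r;
  for (int i = 0; i < 8; i++)
    r.values[i] = (int16_t)(a.values[i] + b.values[i + 8]);
  return r;
}

/* New shape: extract the high half, then reuse the narrow widening add.
 * The extra copy in get_high is what the thread expects compilers to elide. */
static i8x8 get_high(i8x16 v) {
  i8x8 r;
  memcpy(r.values, v.values + 8, sizeof(r.values));
  return r;
}

static i16x8 addw(i16x8 a, i8x8 b) {
  i16x8 r;
  for (int i = 0; i < 8; i++)
    r.values[i] = (int16_t)(a.values[i] + b.values[i]);
  return r;
}

i16x8 addw_high_reuse(i16x8 a, i8x16 b) {
  return addw(a, get_high(b));
}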
46 changes: 8 additions & 38 deletions simde/arm/neon/mlal_high.h
@@ -28,8 +28,8 @@
#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_H)
#define SIMDE_ARM_NEON_MLAL_HIGH_H

#include "movl_high.h"
#include "mla.h"
#include "get_high.h"
#include "mlal.h"
#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +42,7 @@ simde_vmlal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_s8(a, b, c);
#else
return simde_vmlaq_s16(a, simde_vmovl_high_s8(b), simde_vmovl_high_s8(c));
return simde_vmlal_s8(a, simde_vget_high_s8(b), simde_vget_high_s8(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,7 +56,7 @@ simde_vmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_s16(a, b, c);
#else
return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vmovl_high_s16(c));
return simde_vmlal_s16(a, simde_vget_high_s16(b), simde_vget_high_s16(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -70,22 +70,7 @@ simde_vmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_s32(a, b, c);
#else
simde_int64x2_private
r_,
a_ = simde_int64x2_to_private(a),
b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
c_ = simde_int64x2_to_private(simde_vmovl_high_s32(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_int64x2_from_private(r_);
return simde_vmlal_s32(a, simde_vget_high_s32(b), simde_vget_high_s32(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +84,7 @@ simde_vmlal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_u8(a, b, c);
#else
return simde_vmlaq_u16(a, simde_vmovl_high_u8(b), simde_vmovl_high_u8(c));
return simde_vmlal_u8(a, simde_vget_high_u8(b), simde_vget_high_u8(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +98,7 @@ simde_vmlal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_u16(a, b, c);
#else
return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vmovl_high_u16(c));
return simde_vmlal_u16(a, simde_vget_high_u16(b), simde_vget_high_u16(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -127,22 +112,7 @@ simde_vmlal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_u32(a, b, c);
#else
simde_uint64x2_private
r_,
a_ = simde_uint64x2_to_private(a),
b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
c_ = simde_uint64x2_to_private(simde_vmovl_high_u32(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_uint64x2_from_private(r_);
return simde_vmlal_u32(a, simde_vget_high_u32(b), simde_vget_high_u32(c));
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
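For the 8- and 16-bit inputs the old fallback could widen with vmovl_high and then use vmlaq, but the 32-bit inputs produce 64-bit lanes, and NEON has no vmlaq_s64/vmlaq_u64, which is presumably why those two functions carried the private-struct loop shown in the removed code. The composition through vmlal_s32 removes that special case. A hedged usage sketch of the s32 lane semantics (r[i] = a[i] + (int64_t)b[i + 2] * c[i + 2]), under the same illustrative harness and header assumption as the earlier example:

/* Sketch: simde_vmlal_high_s32 lane semantics checked against scalar math.
 * Same illustrative include-path assumption as the earlier example. */
#include <simde/arm/neon.h>
#include <inttypes.h>
#include <stdio.h>

int main(void) {
  int64_t acc[2] = { 100, -100 };
  int32_t b[4]   = { 1, 2,  30000, -40000 };
  int32_t c[4]   = { 5, 6,  70000,  80000 };

  simde_int64x2_t r = simde_vmlal_high_s32(
      simde_vld1q_s64(acc), simde_vld1q_s32(b), simde_vld1q_s32(c));

  int64_t out[2];
  simde_vst1q_s64(out, r);
  for (int i = 0; i < 2; i++) {
    /* High-half lanes are b[i + 2] and c[i + 2]; the product is formed in
     * 64 bits, so 30000 * 70000 does not overflow as it would in int32_t. */
    int64_t expect = acc[i] + (int64_t)b[i + 2] * (int64_t)c[i + 2];
    printf("lane %d: got %" PRId64 ", want %" PRId64 "\n", i, out[i], expect);
  }
  return 0;
}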
43 changes: 6 additions & 37 deletions simde/arm/neon/mlal_high_n.h
@@ -27,9 +27,8 @@
#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H)
#define SIMDE_ARM_NEON_MLAL_HIGH_N_H

#include "movl_high.h"
#include "dup_n.h"
#include "mla.h"
#include "get_high.h"
#include "mlal_n.h"
#include "types.h"

HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +41,7 @@ simde_vmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_s16(a, b, c);
#else
return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c));
return simde_vmlal_n_s16(a, simde_vget_high_s16(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,22 +55,8 @@ simde_vmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_s32(a, b, c);
#else
simde_int64x2_private
r_,
a_ = simde_int64x2_to_private(a),
b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
c_ = simde_int64x2_to_private(simde_vdupq_n_s64(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_int64x2_from_private(r_);
return simde_vmlal_n_s32(a, simde_vget_high_s32(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +69,7 @@ simde_vmlal_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_u16(a, b, c);
#else
return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c));
return simde_vmlal_n_u16(a, simde_vget_high_u16(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,22 +83,7 @@ simde_vmlal_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlal_high_n_u32(a, b, c);
#else
simde_uint64x2_private
r_,
a_ = simde_uint64x2_to_private(a),
b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
c_ = simde_uint64x2_to_private(simde_vdupq_n_u64(c));

#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = (b_.values * c_.values) + a_.values;
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
}
#endif

return simde_uint64x2_from_private(r_);
return simde_vmlal_n_u32(a, simde_vget_high_u32(b), c);
#endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
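The _n variants take a scalar multiplier; the old fallback broadcast it with vdupq_n and then multiply-accumulated (again with a private-struct loop for the 64-bit-lane cases), while the new code passes the scalar straight through to vmlal_n on the high half. A short sketch comparing the intrinsic against that composition, under the same illustrative header assumption as the earlier examples:

/* Sketch: simde_vmlal_high_n_s16 should match composing simde_vmlal_n_s16
 * with simde_vget_high_s16, which is exactly what the new fallback does. */
#include <simde/arm/neon.h>
#include <stdio.h>

int main(void) {
  int32_t acc[4] = { 1, 2, 3, 4 };
  int16_t b[8]   = { 10, 11, 12, 13, 14, 15, 16, 17 };
  int16_t c      = 1000;

  simde_int32x4_t full = simde_vmlal_high_n_s16(
      simde_vld1q_s32(acc), simde_vld1q_s16(b), c);
  simde_int32x4_t composed = simde_vmlal_n_s16(
      simde_vld1q_s32(acc), simde_vget_high_s16(simde_vld1q_s16(b)), c);

  int32_t r1[4], r2[4];
  simde_vst1q_s32(r1, full);
  simde_vst1q_s32(r2, composed);
  for (int i = 0; i < 4; i++) {
    /* Each lane is acc[i] + b[i + 4] * c, accumulated in 32 bits. */
    printf("lane %d: %d vs %d (expect %d)\n",
           i, (int)r1[i], (int)r2[i], (int)(acc[i] + b[i + 4] * c));
  }
  return 0;
}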