diff --git a/simde/arm/neon/addl_high.h b/simde/arm/neon/addl_high.h
index fdef796c9..d4fc7ba7e 100644
--- a/simde/arm/neon/addl_high.h
+++ b/simde/arm/neon/addl_high.h
@@ -28,9 +28,8 @@
 #if !defined(SIMDE_ARM_NEON_ADDL_HIGH_H)
 #define SIMDE_ARM_NEON_ADDL_HIGH_H
 
-#include "add.h"
-#include "movl.h"
-#include "movl_high.h"
+#include "addl.h"
+#include "get_high.h"
 #include "types.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
@@ -43,7 +42,7 @@ simde_vaddl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddl_high_s8(a, b);
   #else
-    return simde_vaddq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
+    return simde_vaddl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -57,7 +56,7 @@ simde_vaddl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddl_high_s16(a, b);
   #else
-    return simde_vaddq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
+    return simde_vaddl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -71,7 +70,7 @@ simde_vaddl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddl_high_s32(a, b);
   #else
-    return simde_vaddq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
+    return simde_vaddl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +84,7 @@ simde_vaddl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddl_high_u8(a, b);
   #else
-    return simde_vaddq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
+    return simde_vaddl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +98,7 @@ simde_vaddl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddl_high_u16(a, b);
   #else
-    return simde_vaddq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
+    return simde_vaddl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +112,7 @@ simde_vaddl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddl_high_u32(a, b);
   #else
-    return simde_vaddq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
+    return simde_vaddl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/addw_high.h b/simde/arm/neon/addw_high.h
index 1f2df9052..eb18437e3 100644
--- a/simde/arm/neon/addw_high.h
+++ b/simde/arm/neon/addw_high.h
@@ -28,8 +28,8 @@
 #define SIMDE_ARM_NEON_ADDW_HIGH_H
 
 #include "types.h"
-#include "movl_high.h"
-#include "add.h"
+#include "get_high.h"
+#include "addw.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -40,19 +40,8 @@ simde_int16x8_t
 simde_vaddw_high_s8(simde_int16x8_t a, simde_int8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddw_high_s8(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vaddq_s16(a, simde_vmovl_high_s8(b));
   #else
-    simde_int16x8_private r_;
-    simde_int16x8_private a_ = simde_int16x8_to_private(a);
-    simde_int8x16_private b_ = simde_int8x16_to_private(b);
-
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-    }
-
-    return simde_int16x8_from_private(r_);
+    return simde_vaddw_s8(a, simde_vget_high_s8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -65,19 +54,8 @@ simde_int32x4_t
 simde_vaddw_high_s16(simde_int32x4_t a, simde_int16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddw_high_s16(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vaddq_s32(a, simde_vmovl_high_s16(b));
   #else
-    simde_int32x4_private r_;
-    simde_int32x4_private a_ = simde_int32x4_to_private(a);
-    simde_int16x8_private b_ = simde_int16x8_to_private(b);
-
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-    }
-
-    return simde_int32x4_from_private(r_);
+    return simde_vaddw_s16(a, simde_vget_high_s16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -90,19 +68,8 @@ simde_int64x2_t
 simde_vaddw_high_s32(simde_int64x2_t a, simde_int32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vaddw_high_s32(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vaddq_s64(a, simde_vmovl_high_s32(b));
   #else
-    simde_int64x2_private r_;
-    simde_int64x2_private a_ = simde_int64x2_to_private(a);
-    simde_int32x4_private b_ = simde_int32x4_to_private(b);
-
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-    }
-
-    return simde_int64x2_from_private(r_);
+    return simde_vaddw_s32(a, simde_vget_high_s32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -115,19 +82,8 @@ simde_uint16x8_t
 simde_vaddw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddw_high_u8(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vaddq_u16(a, simde_vmovl_high_u8(b));
   #else
-    simde_uint16x8_private r_;
-    simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
-    simde_uint8x16_private b_ = simde_uint8x16_to_private(b);
-
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-    }
-
-    return simde_uint16x8_from_private(r_);
+    return simde_vaddw_u8(a, simde_vget_high_u8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -140,19 +96,8 @@ simde_uint32x4_t
 simde_vaddw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddw_high_u16(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vaddq_u32(a, simde_vmovl_high_u16(b));
   #else
-    simde_uint32x4_private r_;
-    simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
-    simde_uint16x8_private b_ = simde_uint16x8_to_private(b);
-
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-    }
-
-    return simde_uint32x4_from_private(r_);
+    return simde_vaddw_u16(a, simde_vget_high_u16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -165,19 +110,8 @@ simde_uint64x2_t
 simde_vaddw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vaddw_high_u32(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vaddq_u64(a, simde_vmovl_high_u32(b));
   #else
-    simde_uint64x2_private r_;
-    simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
-    simde_uint32x4_private b_ = simde_uint32x4_to_private(b);
-
-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = a_.values[i] + b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-    }
-
-    return simde_uint64x2_from_private(r_);
+    return simde_vaddw_u32(a, simde_vget_high_u32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/mlal_high.h b/simde/arm/neon/mlal_high.h
index f7222d16f..9862c740e 100644
--- a/simde/arm/neon/mlal_high.h
+++ b/simde/arm/neon/mlal_high.h
@@ -28,8 +28,8 @@
 #if !defined(SIMDE_ARM_NEON_MLAL_HIGH_H)
 #define SIMDE_ARM_NEON_MLAL_HIGH_H
 
-#include "movl_high.h"
-#include "mla.h"
+#include "get_high.h"
+#include "mlal.h"
 #include "types.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +42,7 @@ simde_vmlal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_s8(a, b, c);
   #else
-    return simde_vmlaq_s16(a, simde_vmovl_high_s8(b), simde_vmovl_high_s8(c));
+    return simde_vmlal_s8(a, simde_vget_high_s8(b), simde_vget_high_s8(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,7 +56,7 @@ simde_vmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_s16(a, b, c);
   #else
-    return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vmovl_high_s16(c));
+    return simde_vmlal_s16(a, simde_vget_high_s16(b), simde_vget_high_s16(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -70,22 +70,7 @@ simde_vmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_s32(a, b, c);
   #else
-    simde_int64x2_private
-      r_,
-      a_ = simde_int64x2_to_private(a),
-      b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
-      c_ = simde_int64x2_to_private(simde_vmovl_high_s32(c));
-
-    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
-      r_.values = (b_.values * c_.values) + a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
-      }
-    #endif
-
-    return simde_int64x2_from_private(r_);
+    return simde_vmlal_s32(a, simde_vget_high_s32(b), simde_vget_high_s32(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +84,7 @@ simde_vmlal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c)
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_u8(a, b, c);
   #else
-    return simde_vmlaq_u16(a, simde_vmovl_high_u8(b), simde_vmovl_high_u8(c));
+    return simde_vmlal_u8(a, simde_vget_high_u8(b), simde_vget_high_u8(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +98,7 @@ simde_vmlal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c)
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_u16(a, b, c);
   #else
-    return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vmovl_high_u16(c));
+    return simde_vmlal_u16(a, simde_vget_high_u16(b), simde_vget_high_u16(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -127,22 +112,7 @@ simde_vmlal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c)
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_u32(a, b, c);
   #else
-    simde_uint64x2_private
-      r_,
-      a_ = simde_uint64x2_to_private(a),
-      b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
-      c_ = simde_uint64x2_to_private(simde_vmovl_high_u32(c));
-
-    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
-      r_.values = (b_.values * c_.values) + a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
-      }
-    #endif
-
-    return simde_uint64x2_from_private(r_);
+    return simde_vmlal_u32(a, simde_vget_high_u32(b), simde_vget_high_u32(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/mlal_high_n.h b/simde/arm/neon/mlal_high_n.h
index 0c26174ec..915989c1b 100644
--- a/simde/arm/neon/mlal_high_n.h
+++ b/simde/arm/neon/mlal_high_n.h
@@ -27,9 +27,8 @@
 #if !defined(SIMDE_ARM_NEON_MLAL_HIGH_N_H)
 #define SIMDE_ARM_NEON_MLAL_HIGH_N_H
 
-#include "movl_high.h"
-#include "dup_n.h"
-#include "mla.h"
+#include "get_high.h"
+#include "mlal_n.h"
 #include "types.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +41,7 @@ simde_vmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_n_s16(a, b, c);
   #else
-    return simde_vmlaq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c));
+    return simde_vmlal_n_s16(a, simde_vget_high_s16(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,22 +55,7 @@ simde_vmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_n_s32(a, b, c);
   #else
-    simde_int64x2_private
-      r_,
-      a_ = simde_int64x2_to_private(a),
-      b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
-      c_ = simde_int64x2_to_private(simde_vdupq_n_s64(c));
-
-    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
-      r_.values = (b_.values * c_.values) + a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
-      }
-    #endif
-
-    return simde_int64x2_from_private(r_);
+    return simde_vmlal_n_s32(a, simde_vget_high_s32(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +69,7 @@ simde_vmlal_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_n_u16(a, b, c);
   #else
-    return simde_vmlaq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c));
+    return simde_vmlal_n_u16(a, simde_vget_high_u16(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,22 +83,7 @@ simde_vmlal_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlal_high_n_u32(a, b, c);
   #else
-    simde_uint64x2_private
-      r_,
-      a_ = simde_uint64x2_to_private(a),
-      b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
-      c_ = simde_uint64x2_to_private(simde_vdupq_n_u64(c));
-
-    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
-      r_.values = (b_.values * c_.values) + a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = (b_.values[i] * c_.values[i]) + a_.values[i];
-      }
-    #endif
-
-    return simde_uint64x2_from_private(r_);
+    return simde_vmlal_n_u32(a, simde_vget_high_u32(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/mlsl_high.h b/simde/arm/neon/mlsl_high.h
index d70ca935d..d20cb4274 100644
--- a/simde/arm/neon/mlsl_high.h
+++ b/simde/arm/neon/mlsl_high.h
@@ -27,8 +27,8 @@
 #if !defined(SIMDE_ARM_NEON_MLSL_HIGH_H)
 #define SIMDE_ARM_NEON_MLSL_HIGH_H
 
-#include "mull_high.h"
-#include "sub.h"
+#include "get_high.h"
+#include "mlsl.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -40,7 +40,7 @@ simde_vmlsl_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_s8(a, b, c);
   #else
-    return simde_vsubq_s16(a, simde_vmull_high_s8(b, c));
+    return simde_vmlsl_s8(a, simde_vget_high_s8(b), simde_vget_high_s8(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -54,7 +54,7 @@ simde_vmlsl_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_s16(a, b, c);
   #else
-    return simde_vsubq_s32(a, simde_vmull_high_s16(b, c));
+    return simde_vmlsl_s16(a, simde_vget_high_s16(b), simde_vget_high_s16(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -68,7 +68,7 @@ simde_vmlsl_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_s32(a, b, c);
   #else
-    return simde_vsubq_s64(a, simde_vmull_high_s32(b, c));
+    return simde_vmlsl_s32(a, simde_vget_high_s32(b), simde_vget_high_s32(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -82,7 +82,7 @@ simde_vmlsl_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c)
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_u8(a, b, c);
   #else
-    return simde_vsubq_u16(a, simde_vmull_high_u8(b, c));
+    return simde_vmlsl_u8(a, simde_vget_high_u8(b), simde_vget_high_u8(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -96,7 +96,7 @@ simde_vmlsl_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c)
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_u16(a, b, c);
   #else
-    return simde_vsubq_u32(a, simde_vmull_high_u16(b, c));
+    return simde_vmlsl_u16(a, simde_vget_high_u16(b), simde_vget_high_u16(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -110,7 +110,7 @@ simde_vmlsl_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c)
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_u32(a, b, c);
   #else
-    return simde_vsubq_u64(a, simde_vmull_high_u32(b, c));
+    return simde_vmlsl_u32(a, simde_vget_high_u32(b), simde_vget_high_u32(c));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/mlsl_high_n.h b/simde/arm/neon/mlsl_high_n.h
index 7be34c81b..9a1ad1c37 100644
--- a/simde/arm/neon/mlsl_high_n.h
+++ b/simde/arm/neon/mlsl_high_n.h
@@ -27,9 +27,8 @@
 #if !defined(SIMDE_ARM_NEON_MLSL_HIGH_N_H)
 #define SIMDE_ARM_NEON_MLSL_HIGH_N_H
 
-#include "movl_high.h"
-#include "dup_n.h"
-#include "mls.h"
+#include "get_high.h"
+#include "mlsl_n.h"
 #include "types.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +41,7 @@ simde_vmlsl_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_n_s16(a, b, c);
   #else
-    return simde_vmlsq_s32(a, simde_vmovl_high_s16(b), simde_vdupq_n_s32(c));
+    return simde_vmlsl_n_s16(a, simde_vget_high_s16(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,22 +55,7 @@ simde_vmlsl_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_n_s32(a, b, c);
   #else
-    simde_int64x2_private
-      r_,
-      a_ = simde_int64x2_to_private(a),
-      b_ = simde_int64x2_to_private(simde_vmovl_high_s32(b)),
-      c_ = simde_int64x2_to_private(simde_vdupq_n_s64(c));
-
-    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
-      r_.values = a_.values - (b_.values * c_.values);
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - (b_.values[i] * c_.values[i]);
-      }
-    #endif
-
-    return simde_int64x2_from_private(r_);
+    return simde_vmlsl_n_s32(a, simde_vget_high_s32(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +69,7 @@ simde_vmlsl_high_n_u16(simde_uint32x4_t a, simde_uint16x8_t b, uint16_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_n_u16(a, b, c);
   #else
-    return simde_vmlsq_u32(a, simde_vmovl_high_u16(b), simde_vdupq_n_u32(c));
+    return simde_vmlsl_n_u16(a, simde_vget_high_u16(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,22 +83,7 @@ simde_vmlsl_high_n_u32(simde_uint64x2_t a, simde_uint32x4_t b, uint32_t c) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmlsl_high_n_u32(a, b, c);
   #else
-    simde_uint64x2_private
-      r_,
-      a_ = simde_uint64x2_to_private(a),
-      b_ = simde_uint64x2_to_private(simde_vmovl_high_u32(b)),
-      c_ = simde_uint64x2_to_private(simde_vdupq_n_u64(c));
-
-    #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
-      r_.values = a_.values - (b_.values * c_.values);
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - (b_.values[i] * c_.values[i]);
-      }
-    #endif
-
-    return simde_uint64x2_from_private(r_);
+    return simde_vmlsl_n_u32(a, simde_vget_high_u32(b), c);
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/mull_high.h b/simde/arm/neon/mull_high.h
index 87e83369a..6f262e7f4 100644
--- a/simde/arm/neon/mull_high.h
+++ b/simde/arm/neon/mull_high.h
@@ -29,9 +29,9 @@
 #define SIMDE_ARM_NEON_MULL_HIGH_H
 
 #include "types.h"
-#include "mul.h"
-#include "movl_high.h"
 #include "mull.h"
+#include "get_high.h"
+
 
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -43,7 +43,7 @@ simde_vmull_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmull_high_s8(a, b);
   #else
-    return simde_vmulq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
+    return simde_vmull_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -57,7 +57,7 @@ simde_vmull_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmull_high_s16(a, b);
   #else
-    return simde_vmulq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
+    return simde_vmull_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -71,7 +71,7 @@ simde_vmull_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmull_high_s32(a, b);
   #else
-    return simde_x_vmulq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
+    return simde_vmull_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -85,7 +85,7 @@ simde_vmull_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmull_high_u8(a, b);
   #else
-    return simde_vmulq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
+    return simde_vmull_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -99,7 +99,7 @@ simde_vmull_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmull_high_u16(a, b);
   #else
-    return simde_vmulq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
+    return simde_vmull_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -113,7 +113,7 @@ simde_vmull_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vmull_high_u32(a, b);
   #else
-    return simde_x_vmulq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
+    return simde_vmull_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/subl_high.h b/simde/arm/neon/subl_high.h
index d45f4989b..8d003345a 100644
--- a/simde/arm/neon/subl_high.h
+++ b/simde/arm/neon/subl_high.h
@@ -27,9 +27,8 @@
 #if !defined(SIMDE_ARM_NEON_SUBL_HIGH_H)
 #define SIMDE_ARM_NEON_SUBL_HIGH_H
 
-#include "sub.h"
-#include "movl.h"
-#include "movl_high.h"
+#include "subl.h"
+#include "get_high.h"
 #include "types.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
@@ -42,7 +41,7 @@ simde_vsubl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubl_high_s8(a, b);
   #else
-    return simde_vsubq_s16(simde_vmovl_high_s8(a), simde_vmovl_high_s8(b));
+    return simde_vsubl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -56,7 +55,7 @@ simde_vsubl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubl_high_s16(a, b);
   #else
-    return simde_vsubq_s32(simde_vmovl_high_s16(a), simde_vmovl_high_s16(b));
+    return simde_vsubl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -70,7 +69,7 @@ simde_vsubl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubl_high_s32(a, b);
   #else
-    return simde_vsubq_s64(simde_vmovl_high_s32(a), simde_vmovl_high_s32(b));
+    return simde_vsubl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -84,7 +83,7 @@ simde_vsubl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubl_high_u8(a, b);
   #else
-    return simde_vsubq_u16(simde_vmovl_high_u8(a), simde_vmovl_high_u8(b));
+    return simde_vsubl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -98,7 +97,7 @@ simde_vsubl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubl_high_u16(a, b);
   #else
-    return simde_vsubq_u32(simde_vmovl_high_u16(a), simde_vmovl_high_u16(b));
+    return simde_vsubl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -112,7 +111,7 @@ simde_vsubl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubl_high_u32(a, b);
   #else
-    return simde_vsubq_u64(simde_vmovl_high_u32(a), simde_vmovl_high_u32(b));
+    return simde_vsubl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
diff --git a/simde/arm/neon/subw_high.h b/simde/arm/neon/subw_high.h
index 729a478a7..e09551f44 100644
--- a/simde/arm/neon/subw_high.h
+++ b/simde/arm/neon/subw_high.h
@@ -28,8 +28,8 @@
 #define SIMDE_ARM_NEON_SUBW_HIGH_H
 
 #include "types.h"
-#include "movl_high.h"
-#include "sub.h"
+#include "get_high.h"
+#include "subw.h"
 
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -40,24 +40,8 @@ simde_int16x8_t
 simde_vsubw_high_s8(simde_int16x8_t a, simde_int8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubw_high_s8(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vsubq_s16(a, simde_vmovl_high_s8(b));
   #else
-    simde_int16x8_private r_;
-    simde_int16x8_private a_ = simde_int16x8_to_private(a);
-    simde_int8x16_private b_ = simde_int8x16_to_private(b);
-
-    #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
-      SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
-      r_.values -= a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-      }
-    #endif
-
-    return simde_int16x8_from_private(r_);
+    return simde_vsubw_s8(a, simde_vget_high_s8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -70,24 +54,8 @@ simde_int32x4_t
 simde_vsubw_high_s16(simde_int32x4_t a, simde_int16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubw_high_s16(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vsubq_s32(a, simde_vmovl_high_s16(b));
   #else
-    simde_int32x4_private r_;
-    simde_int32x4_private a_ = simde_int32x4_to_private(a);
-    simde_int16x8_private b_ = simde_int16x8_to_private(b);
-
-    #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
-      SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
-      r_.values -= a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-      }
-    #endif
-
-    return simde_int32x4_from_private(r_);
+    return simde_vsubw_s16(a, simde_vget_high_s16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -100,24 +68,8 @@ simde_int64x2_t
 simde_vsubw_high_s32(simde_int64x2_t a, simde_int32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubw_high_s32(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vsubq_s64(a, simde_vmovl_high_s32(b));
   #else
-    simde_int64x2_private r_;
-    simde_int64x2_private a_ = simde_int64x2_to_private(a);
-    simde_int32x4_private b_ = simde_int32x4_to_private(b);
-
-    #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
-      SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
-      r_.values -= a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-      }
-    #endif
-
-    return simde_int64x2_from_private(r_);
+    return simde_vsubw_s32(a, simde_vget_high_s32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -130,24 +82,8 @@ simde_uint16x8_t
 simde_vsubw_high_u8(simde_uint16x8_t a, simde_uint8x16_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubw_high_u8(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vsubq_u16(a, simde_vmovl_high_u8(b));
   #else
-    simde_uint16x8_private r_;
-    simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
-    simde_uint8x16_private b_ = simde_uint8x16_to_private(b);
-
-    #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
-      SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
-      r_.values -= a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-      }
-    #endif
-
-    return simde_uint16x8_from_private(r_);
+    return simde_vsubw_u8(a, simde_vget_high_u8(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -160,24 +96,8 @@ simde_uint32x4_t
 simde_vsubw_high_u16(simde_uint32x4_t a, simde_uint16x8_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubw_high_u16(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vsubq_u32(a, simde_vmovl_high_u16(b));
   #else
-    simde_uint32x4_private r_;
-    simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
-    simde_uint16x8_private b_ = simde_uint16x8_to_private(b);
-
-    #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
-      SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
-      r_.values -= a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-      }
-    #endif
-
-    return simde_uint32x4_from_private(r_);
+    return simde_vsubw_u16(a, simde_vget_high_u16(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
@@ -190,24 +110,8 @@ simde_uint64x2_t
 simde_vsubw_high_u32(simde_uint64x2_t a, simde_uint32x4_t b) {
   #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
     return vsubw_high_u32(a, b);
-  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
-    return simde_vsubq_u64(a, simde_vmovl_high_u32(b));
   #else
-    simde_uint64x2_private r_;
-    simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
-    simde_uint32x4_private b_ = simde_uint32x4_to_private(b);
-
-    #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_)
-      SIMDE_CONVERT_VECTOR_(r_.values, b_.values);
-      r_.values -= a_.values;
-    #else
-      SIMDE_VECTORIZE
-      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-        r_.values[i] = a_.values[i] - b_.values[i + ((sizeof(b_.values) / sizeof(b_.values[0])) / 2)];
-      }
-    #endif
-
-    return simde_uint64x2_from_private(r_);
+    return simde_vsubw_u32(a, simde_vget_high_u32(b));
   #endif
 }
 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
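
Note (reviewer sketch, not part of the patch): every fallback above relies on the identity op_high(a, b) == op(vget_high(a), vget_high(b)). A minimal standalone check of that identity for one of the rewritten functions, assuming SIMDe is on the include path and the AArch64 native path is not taken, could look like the following (the file name, test values, and the choice of vaddl_high_s8 are illustrative):

    /* check_addl_high.c (hypothetical sanity check, not part of this patch) */
    #include <stdio.h>
    #include <string.h>
    #include <simde/arm/neon.h>

    int main(void) {
      int8_t a_buf[16], b_buf[16];
      for (int i = 0 ; i < 16 ; i++) {
        a_buf[i] = (int8_t) (i - 8);    /* arbitrary test data */
        b_buf[i] = (int8_t) (100 + i);
      }

      simde_int8x16_t a = simde_vld1q_s8(a_buf);
      simde_int8x16_t b = simde_vld1q_s8(b_buf);

      /* The *_high form ... */
      simde_int16x8_t via_high = simde_vaddl_high_s8(a, b);
      /* ... versus the composition this patch now uses as the fallback. */
      simde_int16x8_t via_get = simde_vaddl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));

      int16_t r1[8], r2[8];
      simde_vst1q_s16(r1, via_high);
      simde_vst1q_s16(r2, via_get);
      printf("%s\n", memcmp(r1, r2, sizeof(r1)) == 0 ? "match" : "MISMATCH");
      return 0;
    }

The same composition covers the vaddw/vsubw/vsubl/vmull/vmlal/vmlsl (and _n) variants, since each non-_high helper already performs the widening itself.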