Skip to content

Commit

Permalink
Release 2.1.0
Browse files Browse the repository at this point in the history
### Known Issues

- half8 "equals" and "not equals" operators don't conform to the IEEE 754 standard - Unity has not yet responded to my bug report regarding their "half" implementation

### Fixes

- fixed a Burst compilation error triggered by "Sse4_1.blend_epi16" when compiling for SSE2, caused by the fallback code not using a constant value for "imm8"
- fixed incorrect CPU feature checks for quarter vector type-conversion code when compiling for SSE2
- fixed "tzcnt" implementations (were completely broken)
- fixed scalar (single value and C# fallback) "lzcnt" implementations for (s)byte and (u)short values and (u)long4 vectors

### Additions

- added "ulong countbits(void* ptr, ulong bytes)", which counts the number of 1-bits in a given block of memory, using Wojciech Mula's SIMD population count algorithm
- added high performance and/or SIMD "gcd" a.k.a. greatest common divisor functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors
- added high performance and/or SIMD "lcm" a.k.a. least common multiple functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors
- added high performance and/or SIMD "intsqrt" - integer square root (floor(sqrt(x))) - functions for all integer and integer vector types, with the functions for signed integers throwing an ArgumentOutOfRangeException in case a value is negative

### Improvements

- performance improvements of "avg" functions for signed integer vectors
- added SIMD implementations of the "transpose" functions for all matrix types
- added SSE4 and SSE2 fallback code for variable bitshifts ("shl", "shrl" and "shra")
- added SSE2 fallback code for (s)byte vector-by-vector division and modulo operations
- added SSE2 fallback code for "all_dif" for (s)byte16, (u)short8 and (u)int8 vectors
- added SSE2 fallback code for typecasting, propagating through the entire library
- added SSE2 fallback code for "addsub" and "subadd" functions
- bitmask32 and bitmask64 now allow for masks to be up to 32 and 64 bits wide, respectively

### Changes

- renamed "BurstCompilerException" to "CPUFeatureCheckException"
- "shl", "shrl" and "shra" now have undefined behavior when the shift amount lies outside [0, 8 * sizeof(integer_type)], for performance reasons and because of differences between SSE, AVX and managed C#

### Fixed Oversights

- added "shl", "shrl" and "shra" functions for (s)byte and (u)short vectors
- added "ror" and "rol" (varying per element) functions for (s)byte and (u)short vectors
- added "compareto" functions for all vector types except half- and quarter vectors
- added "all_dif" functions for (s)byte32 vectors
- added vshr/l and vror/l functions for (s)byte32 and (u)short16 vectors
  • Loading branch information
MrUnbelievable92 committed Feb 26, 2021
1 parent 2970a15 commit a6b5c64
Show file tree
Hide file tree
Showing 142 changed files with 19,088 additions and 6,549 deletions.
78 changes: 46 additions & 32 deletions Runtime/Functions/Arithmetic/Add-Subtract.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public static float2 addsub(float2 a, float2 b)
}
else
{
return new float2(a.x + b.x, a.y - b.y);
return a + math.select(b, -b, new bool2(false, true));
}
}

Expand All @@ -32,7 +32,7 @@ public static float3 addsub(float3 a, float3 b)
}
else
{
return new float3(a.x + b.x, a.y - b.y, a.z + b.z);
return a + math.select(b, -b, new bool3(false, true, false));
}
}

Expand All @@ -46,7 +46,7 @@ public static float4 addsub(float4 a, float4 b)
}
else
{
return new float4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
return a + math.select(b, -b, new bool4(false, true, false, true));
}
}

Expand Down Expand Up @@ -75,7 +75,7 @@ public static double2 addsub(double2 a, double2 b)
}
else
{
return new double2(a.x + b.x, a.y - b.y);
return a + math.select(b, -b, new bool2(false, true));
}
}

Expand All @@ -89,7 +89,7 @@ public static double3 addsub(double3 a, double3 b)
}
else
{
return new double3(a.x + b.x, a.y - b.y, a.z + b.z);
return a + math.select(b, -b, new bool3(false, true, false));
}
}

Expand All @@ -103,7 +103,7 @@ public static double4 addsub(double4 a, double4 b)
}
else
{
return new double4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
return a + math.select(b, -b, new bool4(false, true, false, true));
}
}

Expand All @@ -118,7 +118,7 @@ public static byte2 addsub(byte2 a, byte2 b)
}
else
{
return new byte2((byte)(a.x + b.x), (byte)(a.y - b.y));
return a + select(b, (byte2)(-(sbyte2)b), new bool2(false, true));
}
}

Expand All @@ -132,7 +132,7 @@ public static byte3 addsub(byte3 a, byte3 b)
}
else
{
return new byte3((byte)(a.x + b.x), (byte)(a.y - b.y), (byte)(a.z + b.z));
return a + select(b, (byte3)(-(sbyte3)b), new bool3(false, true, false));
}
}

Expand All @@ -146,7 +146,7 @@ public static byte4 addsub(byte4 a, byte4 b)
}
else
{
return new byte4((byte)(a.x + b.x), (byte)(a.y - b.y), (byte)(a.z + b.z), (byte)(a.w - b.w));
return a + select(b, (byte4)(-(sbyte4)b), new bool4(false, true, false, true));
}
}

Expand All @@ -160,7 +160,7 @@ public static byte8 addsub(byte8 a, byte8 b)
}
else
{
return new byte8((byte)(a.x0 + b.x0), (byte)(a.x1 - b.x1), (byte)(a.x2 + b.x2), (byte)(a.x3 - b.x3), (byte)(a.x4 + b.x4), (byte)(a.x5 - b.x5), (byte)(a.x6 + b.x6), (byte)(a.x7 - b.x7));
return a + select(b, (byte8)(-(sbyte8)b), new bool8(false, true, false, true, false, true, false, true));
}
}

Expand All @@ -174,7 +174,7 @@ public static byte16 addsub(byte16 a, byte16 b)
}
else
{
return new byte16((byte)(a.x0 + b.x0), (byte)(a.x1 - b.x1), (byte)(a.x2 + b.x2), (byte)(a.x3 - b.x3), (byte)(a.x4 + b.x4), (byte)(a.x5 - b.x5), (byte)(a.x6 + b.x6), (byte)(a.x7 - b.x7), (byte)(a.x8 + b.x8), (byte)(a.x9 - b.x9), (byte)(a.x10 + b.x10), (byte)(a.x11 - b.x11), (byte)(a.x12 + b.x12), (byte)(a.x13 - b.x13), (byte)(a.x14 + b.x14), (byte)(a.x15 - b.x15));
return a + select(b, (byte16)(-(sbyte16)b), new bool16(false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true));
}
}

Expand Down Expand Up @@ -203,7 +203,7 @@ public static sbyte2 addsub(sbyte2 a, sbyte2 b)
}
else
{
return new sbyte2((sbyte)(a.x + b.x), (sbyte)(a.y - b.y));
return a + select(b, -b, new bool2(false, true));
}
}

Expand All @@ -217,7 +217,7 @@ public static sbyte3 addsub(sbyte3 a, sbyte3 b)
}
else
{
return new sbyte3((sbyte)(a.x + b.x), (sbyte)(a.y - b.y), (sbyte)(a.z + b.z));
return a + select(b, -b, new bool3(false, true, false));
}
}

Expand All @@ -231,7 +231,7 @@ public static sbyte4 addsub(sbyte4 a, sbyte4 b)
}
else
{
return new sbyte4((sbyte)(a.x + b.x), (sbyte)(a.y - b.y), (sbyte)(a.z + b.z), (sbyte)(a.w - b.w));
return a + select(b, -b, new bool4(false, true, false, true));
}
}

Expand All @@ -245,7 +245,7 @@ public static sbyte8 addsub(sbyte8 a, sbyte8 b)
}
else
{
return new sbyte8((sbyte)(a.x0 + b.x0), (sbyte)(a.x1 - b.x1), (sbyte)(a.x2 + b.x2), (sbyte)(a.x3 - b.x3), (sbyte)(a.x4 + b.x4), (sbyte)(a.x5 - b.x5), (sbyte)(a.x6 + b.x6), (sbyte)(a.x7 - b.x7));
return a + select(b, -b, new bool8(false, true, false, true, false, true, false, true));
}
}

Expand All @@ -259,7 +259,7 @@ public static sbyte16 addsub(sbyte16 a, sbyte16 b)
}
else
{
return new sbyte16((sbyte)(a.x0 + b.x0), (sbyte)(a.x1 - b.x1), (sbyte)(a.x2 + b.x2), (sbyte)(a.x3 - b.x3), (sbyte)(a.x4 + b.x4), (sbyte)(a.x5 - b.x5), (sbyte)(a.x6 + b.x6), (sbyte)(a.x7 - b.x7), (sbyte)(a.x8 + b.x8), (sbyte)(a.x9 - b.x9), (sbyte)(a.x10 + b.x10), (sbyte)(a.x11 - b.x11), (sbyte)(a.x12 + b.x12), (sbyte)(a.x13 - b.x13), (sbyte)(a.x14 + b.x14), (sbyte)(a.x15 - b.x15));
return a + select(b, -b, new bool16(false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true));
}
}

Expand Down Expand Up @@ -288,7 +288,7 @@ public static ushort2 addsub(ushort2 a, ushort2 b)
}
else
{
return new ushort2((ushort)(a.x + b.x), (ushort)(a.y - b.y));
return a + select(b, (ushort2)(-(short2)b), new bool2(false, true));
}
}

Expand All @@ -302,7 +302,7 @@ public static ushort3 addsub(ushort3 a, ushort3 b)
}
else
{
return new ushort3((ushort)(a.x + b.x), (ushort)(a.y - b.y), (ushort)(a.z + b.z));
return a + select(b, (ushort3)(-(short3)b), new bool3(false, true, false));
}
}

Expand All @@ -316,7 +316,7 @@ public static ushort4 addsub(ushort4 a, ushort4 b)
}
else
{
return new ushort4((ushort)(a.x + b.x), (ushort)(a.y - b.y), (ushort)(a.z + b.z), (ushort)(a.w - b.w));
return a + select(b, (ushort4)(-(short4)b), new bool4(false, true, false, true));
}
}

Expand All @@ -330,7 +330,7 @@ public static ushort8 addsub(ushort8 a, ushort8 b)
}
else
{
return new ushort8((ushort)(a.x0 + b.x0), (ushort)(a.x1 - b.x1), (ushort)(a.x2 + b.x2), (ushort)(a.x3 - b.x3), (ushort)(a.x4 + b.x4), (ushort)(a.x5 - b.x5), (ushort)(a.x6 + b.x6), (ushort)(a.x7 - b.x7));
return a + select(b, (ushort8)(-(short8)b), new bool8(false, true, false, true, false, true, false, true));
}
}

Expand Down Expand Up @@ -359,7 +359,7 @@ public static short2 addsub(short2 a, short2 b)
}
else
{
return new short2((short)(a.x + b.x), (short)(a.y - b.y));
return a + select(b, -b, new bool2(false, true));
}
}

Expand All @@ -373,7 +373,7 @@ public static short3 addsub(short3 a, short3 b)
}
else
{
return new short3((short)(a.x + b.x), (short)(a.y - b.y), (short)(a.z + b.z));
return a + select(b, -b, new bool3(false, true, false));
}
}

Expand All @@ -387,7 +387,7 @@ public static short4 addsub(short4 a, short4 b)
}
else
{
return new short4((short)(a.x + b.x), (short)(a.y - b.y), (short)(a.z + b.z), (short)(a.w - b.w));
return a + select(b, -b, new bool4(false, true, false, true));
}
}

Expand All @@ -401,7 +401,7 @@ public static short8 addsub(short8 a, short8 b)
}
else
{
return new short8((short)(a.x0 + b.x0), (short)(a.x1 - b.x1), (short)(a.x2 + b.x2), (short)(a.x3 - b.x3), (short)(a.x4 + b.x4), (short)(a.x5 - b.x5), (short)(a.x6 + b.x6), (short)(a.x7 - b.x7));
return a + select(b, -b, new bool8(false, true, false, true, false, true, false, true));
}
}

Expand Down Expand Up @@ -432,7 +432,7 @@ public static uint2 addsub(uint2 a, uint2 b)
}
else
{
return new uint2(a.x + b.x, a.y - b.y);
return a + math.select(b, (uint2)(-(int2)b), new bool2(false, true));
}
}

Expand All @@ -448,7 +448,7 @@ public static uint3 addsub(uint3 a, uint3 b)
}
else
{
return new uint3(a.x + b.x, a.y - b.y, a.z + b.z);
return a + math.select(b, (uint3)(-(int3)b), new bool3(false, true, false));
}
}

Expand All @@ -464,7 +464,7 @@ public static uint4 addsub(uint4 a, uint4 b)
}
else
{
return new uint4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
return a + math.select(b, (uint4)(-(int4)b), new bool4(false, true, false, true));
}
}

Expand Down Expand Up @@ -495,7 +495,7 @@ public static int2 addsub(int2 a, int2 b)
}
else
{
return new int2(a.x + b.x, a.y - b.y);
return a + math.select(b, -b, new bool2(false, true));
}
}

Expand All @@ -511,7 +511,7 @@ public static int3 addsub(int3 a, int3 b)
}
else
{
return new int3(a.x + b.x, a.y - b.y, a.z + b.z);
return a + math.select(b, -b, new bool3(false, true, false));
}
}

Expand All @@ -527,7 +527,7 @@ public static int4 addsub(int4 a, int4 b)
}
else
{
return new int4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
return a + math.select(b, -b, new bool4(false, true, false, true));
}
}

Expand All @@ -552,7 +552,14 @@ public static ulong2 addsub(ulong2 a, ulong2 b)
{
if (Sse2.IsSse2Supported)
{
return a + Mask.BlendEpi16(b, default(v128) - b, 0b1111_0000);
if (Sse4_1.IsSse41Supported)
{
return a + Sse4_1.blend_epi16(b, default(v128) - b, 0b1111_0000);
}
else
{
return a + Mask.BlendEpi16_SSE2(b, default(v128) - b, 0b1111_0000);
}
}
else
{
Expand Down Expand Up @@ -595,7 +602,14 @@ public static long2 addsub(long2 a, long2 b)
{
if (Sse2.IsSse2Supported)
{
return a + Mask.BlendEpi16(b, -b, 0b1111_0000);
if (Sse4_1.IsSse41Supported)
{
return a + Sse4_1.blend_epi16(b, -b, 0b1111_0000);
}
else
{
return a + Mask.BlendEpi16_SSE2(b, -b, 0b1111_0000);
}
}
else
{
Expand Down
Loading

0 comments on commit a6b5c64

Please sign in to comment.