Release 2.1.0

### Known Issues - half8 "equals" and "not equals" operators don't conform to the IEEE 754 standard - Unity has not yet responded to my bug report regarding their "half" implementation ### Fixes - fixed a Burst compilation error triggered by "Sse4_1.blend_epi16" when compiling for SSE2, caused by the fallback code not using a constant value for "imm8" - fixed incorrect CPU feature checks for quarter vector type-conversion code when compiling for SSE2 - fixed "tzcnt" implementations (were completely broken) - fixed scalar (single value and C# fallback) "lzcnt" implementations for (s)byte and (u)short values and (u)long4 vectors ### Additions - added "ulong countbits(void* ptr, ulong bytes)", which counts the number of 1-bits in a given block of memory, using Wojciech Mula's SIMD population count algorithm - added high performance and/or SIMD "gcd" a.k.a. greatest common divisor functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors - added high performance and/or SIMD "lcm" a.k.a. 
least common multiple functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors - added high performance and/or SIMD "intsqrt" - integer square root (floor(sqrt(x))) functions for all integer and integer vector types, with the functions for signed integers throwing an ArgumentOutOfRangeException in case a value is negative ### Improvements - performance improvements of "avg" functions for signed integer vectors - added SIMD implementations of the "transpose" functions for all matrix types - added SSE4 and SSE2 fallback code for variable bitshifts ("shl", "shrl" and "shra") - added SSE2 fallback code for (s)byte vector-by-vector division and modulo operations - added SSE2 fallback code for "all_dif" for (s)byte16, (u)short8 and (u)int8 vectors - added SSE2 fallback code for typecasting, propagating through the entire library - added SSE2 fallback code for "addsub" and "subadd" functions - bitmask32 and bitmask64 now allow for masks to be up to 32 and 64 bits wide, respectively ### Changes - renamed "BurstCompilerException" to "CPUFeatureCheckException" - "shl", "shrl" and "shra" now have undefined behavior when bitshifting any value beyond [0, 8 * sizeof(integer_type)] for performance reasons and because of differences between SSE, AVX and managed C# ### Fixed Oversights - added "shl", "shrl" and "shra" functions for (s)byte and (u)short vectors - added "ror" and "rol" (varying per element) functions for (s)byte and (u)short vectors - added "compareto" functions for all vector types except half- and quarter vectors - added "all_dif" functions for (s)byte32 vectors - added vshr/l and vror/l functions for (s)byte32 and (u)short16 vectors
MrUnbelievable92 · Feb 26, 2021 · a6b5c64 · a6b5c64
1 parent 2970a15
commit a6b5c64
Show file tree

Hide file tree

Showing 142 changed files with 19,088 additions and 6,549 deletions.
diff --git a/Runtime/Functions/Arithmetic/Add-Subtract.cs b/Runtime/Functions/Arithmetic/Add-Subtract.cs
@@ -18,7 +18,7 @@ public static float2 addsub(float2 a, float2 b)
             }
             else
             {
-                return new float2(a.x + b.x, a.y - b.y);
+                return a + math.select(b, -b, new bool2(false, true));
             }
         }
 
@@ -32,7 +32,7 @@ public static float3 addsub(float3 a, float3 b)
             }
             else
             {
-                return new float3(a.x + b.x, a.y - b.y, a.z + b.z);
+                return a + math.select(b, -b, new bool3(false, true, false));
             }
         }
 
@@ -46,7 +46,7 @@ public static float4 addsub(float4 a, float4 b)
             }
             else
             {
-                return new float4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
+                return a + math.select(b, -b, new bool4(false, true, false, true));
             }
         }
 
@@ -75,7 +75,7 @@ public static double2 addsub(double2 a, double2 b)
             }
             else
             {
-                return new double2(a.x + b.x, a.y - b.y);
+                return a + math.select(b, -b, new bool2(false, true));
             }
         }
 
@@ -89,7 +89,7 @@ public static double3 addsub(double3 a, double3 b)
             }
             else
             {
-                return new double3(a.x + b.x, a.y - b.y, a.z + b.z);
+                return a + math.select(b, -b, new bool3(false, true, false));
             }
         }
 
@@ -103,7 +103,7 @@ public static double4 addsub(double4 a, double4 b)
             }
             else
             {
-                return new double4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
+                return a + math.select(b, -b, new bool4(false, true, false, true));
             }
         }
 
@@ -118,7 +118,7 @@ public static byte2 addsub(byte2 a, byte2 b)
             }
             else
             {
-                return new byte2((byte)(a.x + b.x), (byte)(a.y - b.y));
+                return a + select(b, (byte2)(-(sbyte2)b), new bool2(false, true));
             }
         }
 
@@ -132,7 +132,7 @@ public static byte3 addsub(byte3 a, byte3 b)
             }
             else
             {
-                return new byte3((byte)(a.x + b.x), (byte)(a.y - b.y), (byte)(a.z + b.z));
+                return a + select(b, (byte3)(-(sbyte3)b), new bool3(false, true, false));
             }
         }
 
@@ -146,7 +146,7 @@ public static byte4 addsub(byte4 a, byte4 b)
             }
             else
             {
-                return new byte4((byte)(a.x + b.x), (byte)(a.y - b.y), (byte)(a.z + b.z), (byte)(a.w - b.w));
+                return a + select(b, (byte4)(-(sbyte4)b), new bool4(false, true, false, true));
             }
         }
 
@@ -160,7 +160,7 @@ public static byte8 addsub(byte8 a, byte8 b)
             }
             else
             {
-                return new byte8((byte)(a.x0 + b.x0), (byte)(a.x1 - b.x1), (byte)(a.x2 + b.x2), (byte)(a.x3 - b.x3), (byte)(a.x4 + b.x4), (byte)(a.x5 - b.x5), (byte)(a.x6 + b.x6), (byte)(a.x7 - b.x7));
+                return a + select(b, (byte8)(-(sbyte8)b), new bool8(false, true, false, true, false, true, false, true));
             }
         }
 
@@ -174,7 +174,7 @@ public static byte16 addsub(byte16 a, byte16 b)
             }
             else
             {
-                return new byte16((byte)(a.x0 + b.x0), (byte)(a.x1 - b.x1), (byte)(a.x2 + b.x2), (byte)(a.x3 - b.x3), (byte)(a.x4 + b.x4), (byte)(a.x5 - b.x5), (byte)(a.x6 + b.x6), (byte)(a.x7 - b.x7), (byte)(a.x8 + b.x8), (byte)(a.x9 - b.x9), (byte)(a.x10 + b.x10), (byte)(a.x11 - b.x11), (byte)(a.x12 + b.x12), (byte)(a.x13 - b.x13), (byte)(a.x14 + b.x14), (byte)(a.x15 - b.x15));
+                return a + select(b, (byte16)(-(sbyte16)b), new bool16(false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true));
             }
         }
 
@@ -203,7 +203,7 @@ public static sbyte2 addsub(sbyte2 a, sbyte2 b)
             }
             else
             {
-                return new sbyte2((sbyte)(a.x + b.x), (sbyte)(a.y - b.y));
+                return a + select(b, -b, new bool2(false, true));
             }
         }
 
@@ -217,7 +217,7 @@ public static sbyte3 addsub(sbyte3 a, sbyte3 b)
             }
             else
             {
-                return new sbyte3((sbyte)(a.x + b.x), (sbyte)(a.y - b.y), (sbyte)(a.z + b.z));
+                return a + select(b, -b, new bool3(false, true, false));
             }
         }
 
@@ -231,7 +231,7 @@ public static sbyte4 addsub(sbyte4 a, sbyte4 b)
             }
             else
             {
-                return new sbyte4((sbyte)(a.x + b.x), (sbyte)(a.y - b.y), (sbyte)(a.z + b.z), (sbyte)(a.w - b.w));
+                return a + select(b, -b, new bool4(false, true, false, true));
             }
         }
 
@@ -245,7 +245,7 @@ public static sbyte8 addsub(sbyte8 a, sbyte8 b)
             }
             else
             {
-                return new sbyte8((sbyte)(a.x0 + b.x0), (sbyte)(a.x1 - b.x1), (sbyte)(a.x2 + b.x2), (sbyte)(a.x3 - b.x3), (sbyte)(a.x4 + b.x4), (sbyte)(a.x5 - b.x5), (sbyte)(a.x6 + b.x6), (sbyte)(a.x7 - b.x7));
+                return a + select(b, -b, new bool8(false, true, false, true, false, true, false, true));
             }
         }
 
@@ -259,7 +259,7 @@ public static sbyte16 addsub(sbyte16 a, sbyte16 b)
             }
             else
             {
-                return new sbyte16((sbyte)(a.x0 + b.x0), (sbyte)(a.x1 - b.x1), (sbyte)(a.x2 + b.x2), (sbyte)(a.x3 - b.x3), (sbyte)(a.x4 + b.x4), (sbyte)(a.x5 - b.x5), (sbyte)(a.x6 + b.x6), (sbyte)(a.x7 - b.x7), (sbyte)(a.x8 + b.x8), (sbyte)(a.x9 - b.x9), (sbyte)(a.x10 + b.x10), (sbyte)(a.x11 - b.x11), (sbyte)(a.x12 + b.x12), (sbyte)(a.x13 - b.x13), (sbyte)(a.x14 + b.x14), (sbyte)(a.x15 - b.x15));
+                return a + select(b, -b, new bool16(false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true));
             }
         }
 
@@ -288,7 +288,7 @@ public static ushort2 addsub(ushort2 a, ushort2 b)
             }
             else
             {
-                return new ushort2((ushort)(a.x + b.x), (ushort)(a.y - b.y));
+                return a + select(b, (ushort2)(-(short2)b), new bool2(false, true));
             }
         }
 
@@ -302,7 +302,7 @@ public static ushort3 addsub(ushort3 a, ushort3 b)
             }
             else
             {
-                return new ushort3((ushort)(a.x + b.x), (ushort)(a.y - b.y), (ushort)(a.z + b.z));
+                return a + select(b, (ushort3)(-(short3)b), new bool3(false, true, false));
             }
         }
 
@@ -316,7 +316,7 @@ public static ushort4 addsub(ushort4 a, ushort4 b)
             }
             else
             {
-                return new ushort4((ushort)(a.x + b.x), (ushort)(a.y - b.y), (ushort)(a.z + b.z), (ushort)(a.w - b.w));
+                return a + select(b, (ushort4)(-(short4)b), new bool4(false, true, false, true));
             }
         }
 
@@ -330,7 +330,7 @@ public static ushort8 addsub(ushort8 a, ushort8 b)
             }
             else
             {
-                return new ushort8((ushort)(a.x0 + b.x0), (ushort)(a.x1 - b.x1), (ushort)(a.x2 + b.x2), (ushort)(a.x3 - b.x3), (ushort)(a.x4 + b.x4), (ushort)(a.x5 - b.x5), (ushort)(a.x6 + b.x6), (ushort)(a.x7 - b.x7));
+                return a + select(b, (ushort8)(-(short8)b), new bool8(false, true, false, true, false, true, false, true));
             }
         }
 
@@ -359,7 +359,7 @@ public static short2 addsub(short2 a, short2 b)
             }
             else
             {
-                return new short2((short)(a.x + b.x), (short)(a.y - b.y));
+                return a + select(b, -b, new bool2(false, true));
             }
         }
 
@@ -373,7 +373,7 @@ public static short3 addsub(short3 a, short3 b)
             }
             else
             {
-                return new short3((short)(a.x + b.x), (short)(a.y - b.y), (short)(a.z + b.z));
+                return a + select(b, -b, new bool3(false, true, false));
             }
         }
 
@@ -387,7 +387,7 @@ public static short4 addsub(short4 a, short4 b)
             }
             else
             {
-                return new short4((short)(a.x + b.x), (short)(a.y - b.y), (short)(a.z + b.z), (short)(a.w - b.w));
+                return a + select(b, -b, new bool4(false, true, false, true));
             }
         }
 
@@ -401,7 +401,7 @@ public static short8 addsub(short8 a, short8 b)
             }
             else
             {
-                return new short8((short)(a.x0 + b.x0), (short)(a.x1 - b.x1), (short)(a.x2 + b.x2), (short)(a.x3 - b.x3), (short)(a.x4 + b.x4), (short)(a.x5 - b.x5), (short)(a.x6 + b.x6), (short)(a.x7 - b.x7));
+                return a + select(b, -b, new bool8(false, true, false, true, false, true, false, true));
             }
         }
 
@@ -432,7 +432,7 @@ public static uint2 addsub(uint2 a, uint2 b)
             }
             else
             {
-                return new uint2(a.x + b.x, a.y - b.y);
+                return a + math.select(b, (uint2)(-(int2)b), new bool2(false, true));
             }
         }
 
@@ -448,7 +448,7 @@ public static uint3 addsub(uint3 a, uint3 b)
             }
             else
             {
-                return new uint3(a.x + b.x, a.y - b.y, a.z + b.z);
+                return a + math.select(b, (uint3)(-(int3)b), new bool3(false, true, false));
             }
         }
 
@@ -464,7 +464,7 @@ public static uint4 addsub(uint4 a, uint4 b)
             }
             else
             {
-                return new uint4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
+                return a + math.select(b, (uint4)(-(int4)b), new bool4(false, true, false, true));
             }
         }
 
@@ -495,7 +495,7 @@ public static int2 addsub(int2 a, int2 b)
             }
             else
             {
-                return new int2(a.x + b.x, a.y - b.y);
+                return a + math.select(b, -b, new bool2(false, true));
             }
         }
 
@@ -511,7 +511,7 @@ public static int3 addsub(int3 a, int3 b)
             }
             else
             {
-                return new int3(a.x + b.x, a.y - b.y, a.z + b.z);
+                return a + math.select(b, -b, new bool3(false, true, false));
             }
         }
 
@@ -527,7 +527,7 @@ public static int4 addsub(int4 a, int4 b)
             }
             else
             {
-                return new int4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
+                return a + math.select(b, -b, new bool4(false, true, false, true));
             }
         }
 
@@ -552,7 +552,14 @@ public static ulong2 addsub(ulong2 a, ulong2 b)
         {
             if (Sse2.IsSse2Supported)
             {
-                return a + Mask.BlendEpi16(b, default(v128) - b, 0b1111_0000);
+                if (Sse4_1.IsSse41Supported)
+                {
+                    return a + Sse4_1.blend_epi16(b, default(v128) - b, 0b1111_0000);
+                }
+                else
+                {
+                    return a + Mask.BlendEpi16_SSE2(b, default(v128) - b, 0b1111_0000);
+                }
             }
             else
             {
@@ -595,7 +602,14 @@ public static long2 addsub(long2 a, long2 b)
         {
             if (Sse2.IsSse2Supported)
             {
-                return a + Mask.BlendEpi16(b, -b, 0b1111_0000);
+                if (Sse4_1.IsSse41Supported)
+                {
+                    return a + Sse4_1.blend_epi16(b, -b, 0b1111_0000);
+                }
+                else
+                {
+                    return a + Mask.BlendEpi16_SSE2(b, -b, 0b1111_0000);
+                }
             }
             else
             {