Update Random8.cs
Known Issues

    half8 "equals" and "not equals" operators don't conform to the IEEE 754 standard - Unity has not yet responded to my bug report regarding their "half" implementation

Fixes

    fixed a Burst compilation error triggered by "Sse4_1.blend_epi16" when compiling for SSE2, caused by the fallback code not using a constant value for "imm8"
    fixed incorrect CPU feature checks in the quarter vector type-conversion code when compiling for SSE2
    fixed "tzcnt" implementations (they were completely broken)
    fixed scalar (single value and C# fallback) "lzcnt" implementations for (s)byte and (u)short values and (u)long4 vectors (a scalar reference for the expected "tzcnt"/"lzcnt" semantics follows this list)
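
For reference, the semantics "tzcnt" and "lzcnt" are expected to follow for an 8-bit value can be sketched with naive scalar loops. This is an illustration only - the method names below are placeholders, the library's SIMD code works differently, and an all-zero input is assumed to return the bit width, matching the hardware convention:

    // Naive reference for trailing/leading zero counts on an 8-bit value.
    // An all-zero input returns 8 (the bit width), matching the hardware convention.
    static int TzcntByte(byte x)
    {
        if (x == 0) return 8;

        int count = 0;
        while ((x & 1) == 0) { x >>= 1; count++; }     // walk up from the least significant bit
        return count;
    }

    static int LzcntByte(byte x)
    {
        if (x == 0) return 8;

        int count = 0;
        while ((x & 0x80) == 0) { x <<= 1; count++; }  // walk down from the most significant bit
        return count;
    }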

Additions

    added "ulong countbits(void* ptr, ulong bytes)", which counts the number of 1-bits in a given block of memory using Wojciech Mula's SIMD population count algorithm
    added high performance and/or SIMD "gcd" (greatest common divisor) functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors
    added high performance and/or SIMD "lcm" (least common multiple) functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors
    added high performance and/or SIMD "intsqrt" (integer square root, floor(sqrt(x))) functions for all integer and integer vector types; the overloads for signed integers and vectors throw an ArgumentOutOfRangeException if a value is negative (naive scalar sketches of these additions follow this list)
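
For illustration, here are naive scalar sketches of what these additions compute. These show reference semantics only - the library's actual implementations are SIMD, and the method names below (CountBits, Gcd, Lcm, IntSqrt) are placeholders, not the library's API:

    // Reference popcount over a block of memory: counts set bits byte by byte.
    // The library's countbits uses Wojciech Mula's SIMD algorithm instead of this loop.
    static unsafe ulong CountBits(void* ptr, ulong bytes)
    {
        ulong total = 0;
        byte* p = (byte*)ptr;

        for (ulong i = 0; i < bytes; i++)
        {
            byte b = p[i];
            while (b != 0) { total += (ulong)(b & 1); b >>= 1; }
        }
        return total;
    }

    // Euclidean gcd; the result is never negative, which is why unsigned types are returned.
    static uint Gcd(uint a, uint b)
    {
        while (b != 0) { uint t = a % b; a = b; b = t; }
        return a;
    }

    // lcm via gcd; dividing before multiplying avoids overflow for representable results.
    static uint Lcm(uint a, uint b)
    {
        return (a == 0 || b == 0) ? 0 : (a / Gcd(a, b)) * b;
    }

    // floor(sqrt(x)) by linear search - easy to read, O(sqrt(x)), not how a fast version works.
    static uint IntSqrt(uint x)
    {
        uint root = 0;
        while ((ulong)(root + 1) * (root + 1) <= x) root++;
        return root;
    }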

Improvements

    improved performance of "avg" functions for signed integer vectors
    added SIMD implementations of the "transpose" functions for all matrix types
    added SSE4 and SSE2 fallback code for variable bitshifts ("shl", "shrl" and "shra")
    added SSE2 fallback code for (s)byte vector-by-vector division and modulo operations
    added SSE2 fallback code for "all_dif" for (s)byte16, (u)short8 and (u)int8 vectors
    added SSE2 fallback code for typecasting, propagating through the entire library
    added SSE2 fallback code for "addsub" and "subadd" functions
    bitmask32 and bitmask64 now allow masks up to 32 and 64 bits wide, respectively (see the sketch after this list)
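
Why the full-width case needs care: C# masks 32-bit shift counts to 5 bits, so "1u << 32" evaluates to 1 rather than 0. A hypothetical sketch of a mask helper that tolerates a 32-bit-wide mask (illustrative only, not necessarily how the library implements bitmask32):

    // Hypothetical full-width-safe mask: 'numBits' set bits starting at 'index' (index + numBits <= 32).
    static uint Bitmask32(int numBits, int index = 0)
    {
        // "1u << 32" is evaluated as "1u << 0" in C#, so the 32-bit-wide case is handled explicitly.
        uint mask = (numBits == 32) ? uint.MaxValue : (1u << numBits) - 1u;
        return mask << index;
    }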

Changes

    renamed "BurstCompilerException" to "CPUFeatureCheckException"
    "shl", "shrl" and "shra" now have undefined behavior when the shift amount lies outside the interval [0, 8 * sizeof(integer_type) - 1], both for performance reasons and because of differences between SSE, AVX and managed C# (a defensive clamping sketch follows this list)

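A usage note on the shift contract above (a hedged sketch - the helper below is illustrative and not part of the library): if per-element shift counts come from data that might fall outside the valid range, clamp or mask them before calling "shl", "shrl" or "shra" so the result stays defined.

    // Per-element shift counts for 8-bit lanes must stay inside [0, 7] under the new contract.
    // Clamping (or masking with "& 7") before the call keeps every code path well defined.
    static int ClampShiftForByteLane(int count)
    {
        return count < 0 ? 0 : (count > 7 ? 7 : count);
    }
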
Fixed Oversights

    added "shl", "shrl" and "shra" (varying per element) functions for (s)byte and (u)short vectors
    added "ror" and "rol" (varying per element) functions for (s)byte and (u)short vectors (a scalar sketch of the per-element rotate follows this list)
    added "compareto" functions for all vector types except half and quarter vectors
    added "all_dif" functions for (s)byte32 vectors
    added "vshl"/"vshr" and "vrol"/"vror" functions for (s)byte32 and (u)short16 vectors
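
For reference, the per-element rotate added for byte vectors behaves, per 8-bit lane, like the following scalar sketch (illustrative only; "RorByte" is a placeholder name and the rotation count is assumed to lie in [0, 7]):

    // Rotate an 8-bit value right by n bits (n in [0, 7]); "rol" mirrors this to the left.
    static byte RorByte(byte x, int n)
    {
        return (byte)((x >> n) | (x << (8 - n)));   // the cast to byte discards the carried-out bits
    }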

2.1.1 Hotfix
Fixes

    fixed SSE2 "shl", "shrl" and "shra" implementations
    fixed SSE2 "intsqrt" implementations

Improvements

    improved performance of (s)byte2, -3, -4, -8, -16 and (u)short2, -3, -4, -8 "gcd" functions (and thus "lcm") when compiling for Avx2
    improved performance of "tzcnt" and "lzcnt" implementations for all vector types if compiling for SSE4 or higher, propagating through a lot of the library

Fixed Oversights

    added documentation for RandomX methods
MrUnbelievable92 committed Mar 1, 2021
1 parent 9da6887 commit e3eff09
Showing 1 changed file with 9 additions and 9 deletions: Runtime/Types/Random/Random8.cs
@@ -304,7 +304,7 @@ public sbyte16 NextSByte16(sbyte16 min, sbyte16 max)
                 temp = Avx2.mm256_shuffle_epi8(temp, new v256(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0,
                                                               1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0));
 
-                return min + Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(temp, Sse.SHUFFLE(0, 0, 2, 0)));
+                return min + Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(temp, Sse.SHUFFLE(0, 0, 2, 0)));
             }
             else
             {
@@ -361,8 +361,8 @@ public sbyte32 NextSByte32(sbyte32 min, sbyte32 max)
                 hi = Avx2.mm256_shuffle_epi8(hi, new v256(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0,
                                                           1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0));
 
-                return min + new sbyte32(Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(lo, Sse.SHUFFLE(0, 0, 2, 0))),
-                                         Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(hi, Sse.SHUFFLE(0, 0, 2, 0))));
+                return min + new sbyte32(Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(lo, Sse.SHUFFLE(0, 0, 2, 0))),
+                                         Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(hi, Sse.SHUFFLE(0, 0, 2, 0))));
             }
             else
             {
@@ -503,7 +503,7 @@ public byte16 NextByte16(byte16 max)
                 temp = Avx2.mm256_shuffle_epi8(temp, new v256(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0,
                                                               1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0));
 
-                return Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(temp, Sse.SHUFFLE(0, 0, 2, 0)));
+                return Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(temp, Sse.SHUFFLE(0, 0, 2, 0)));
             }
             else
             {
@@ -525,8 +525,8 @@ public byte32 NextByte32(byte32 max)
                 hi = Avx2.mm256_shuffle_epi8(hi, new v256(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0,
                                                           1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0));
 
-                return new byte32(Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(lo, Sse.SHUFFLE(0, 0, 2, 0))),
-                                  Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(hi, Sse.SHUFFLE(0, 0, 2, 0))));
+                return new byte32(Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(lo, Sse.SHUFFLE(0, 0, 2, 0))),
+                                  Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(hi, Sse.SHUFFLE(0, 0, 2, 0))));
             }
             else
             {
@@ -657,7 +657,7 @@ public byte16 NextByte16(byte16 min, byte16 max)
                 temp = Avx2.mm256_shuffle_epi8(temp, new v256(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0,
                                                               1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0));
 
-                return min + Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(temp, Sse.SHUFFLE(0, 0, 2, 0)));
+                return min + Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(temp, Sse.SHUFFLE(0, 0, 2, 0)));
             }
             else
             {
@@ -714,8 +714,8 @@ public byte32 NextByte32(byte32 min, byte32 max)
                 hi = Avx2.mm256_shuffle_epi8(hi, new v256(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0,
                                                           1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0));
 
-                return min + new byte32(Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(lo, Sse.SHUFFLE(0, 0, 2, 0))),
-                                        Avx.mm256_castsi256_si127(Avx2.mm256_permute4x64_epi64(hi, Sse.SHUFFLE(0, 0, 2, 0))));
+                return min + new byte32(Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(lo, Sse.SHUFFLE(0, 0, 2, 0))),
+                                        Avx.mm256_castsi256_si128(Avx2.mm256_permute4x64_epi64(hi, Sse.SHUFFLE(0, 0, 2, 0))));
             }
             else
             {
