-
Notifications
You must be signed in to change notification settings - Fork 33
/
MonteCarloPiEstimator.cs
105 lines (87 loc) · 4.67 KB
/
MonteCarloPiEstimator.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
using Hast.Algorithms.Random;
using Hast.Layer;
using Hast.Synthesis.Attributes;
using Hast.Transformer.SimpleMemory;
using Lombiq.HelpfulLibraries.Common.Utilities;
using System;
using System.Threading.Tasks;
namespace Hast.Samples.SampleAssembly;
/// <summary>
/// Algorithm to calculate Pi with a random <see href="https://en.wikipedia.org/wiki/Monte_Carlo_method">Monte Carlo
/// method</see> in a parallelized manner. For an overview of the idea see <see
/// href="https://www.coursera.org/lecture/parprog1/monte-carlo-method-to-estimate-pi-Zgm76">this video</see>; <see
/// href="http://www.software-architects.com/devblog/2014/09/22/C-Parallel-and-Async-Programming">this blog post's
/// implementation</see> was used as an inspiration too. Also see <c>MonteCarloAlgorithmSampleRunner</c> on what to
/// configure to make this work.
/// </summary>
public class MonteCarloPiEstimator
{
private const int EstimatePiIteractionsCountUInt32Index = 0;
private const int EstimatePiRandomSeedUInt32Index = 1;
private const int EstimatePiInCircleCountSumUInt32Index = 0;
// With a degree of parallelism of 78 the resource utilization of the Nexys A7 board would jump to 101% so this is
// the limit of efficiency. Note that this is one lower than in the currently measured benchmark because since then
// we changed Hastlayer slightly.
[Replaceable(nameof(MonteCarloPiEstimator) + "." + nameof(MaxDegreeOfParallelism))]
public static readonly int MaxDegreeOfParallelism = 77;
private readonly NonSecurityRandomizer _random = new();
public virtual void EstimatePi(SimpleMemory memory)
{
var iterationsCount = memory.ReadUInt32(EstimatePiIteractionsCountUInt32Index);
var randomSeed = (ushort)memory.ReadUInt32(EstimatePiRandomSeedUInt32Index);
var iterationsPerTask = iterationsCount / MaxDegreeOfParallelism;
var tasks = new Task<uint>[MaxDegreeOfParallelism];
for (uint i = 0; i < MaxDegreeOfParallelism; i++)
{
tasks[i] = Task.Factory.StartNew(
indexObject =>
{
var index = (uint)indexObject;
// A 16b PRNG is enough for this task and the xorshift one has suitable quality.
var random = new RandomXorshiftLfsr16 { State = (ushort)(randomSeed + index) };
uint inCircleCount = 0;
for (var j = 0; j < iterationsPerTask; j++)
{
uint a = random.NextUInt16();
uint b = random.NextUInt16();
// A bit of further parallelization can be exploited with SIMD to shave off some execution time.
// However, this needs so much resources on the hardware that the degree of parallelism needs to
// be lowered substantially (below 60).
//// var randomNumbers = new uint[] { random.NextUInt16(), random.NextUInt16() };
//// var products = Common.Numerics.SimdOperations.MultiplyVectors(randomNumbers, randomNumbers, 2);
if ((ulong)(a * a) + (b * b) <= ((uint)ushort.MaxValue * ushort.MaxValue))
{
inCircleCount++;
}
}
return inCircleCount;
},
i);
}
Task.WhenAll(tasks).Wait();
uint inCircleCountSum = 0;
for (int i = 0; i < MaxDegreeOfParallelism; i++)
{
inCircleCountSum += tasks[i].Result;
}
memory.WriteUInt32(EstimatePiInCircleCountSumUInt32Index, inCircleCountSum);
}
public double EstimatePi(uint iterationsCount, IHastlayer hastlayer = null, IHardwareGenerationConfiguration configuration = null)
{
if (iterationsCount % MaxDegreeOfParallelism != 0)
{
throw new InvalidOperationException(
StringHelper.CreateInvariant(
$"The number of iterations must be divisible by {MaxDegreeOfParallelism}."));
}
var memory = hastlayer is null
? SimpleMemory.CreateSoftwareMemory(2)
: hastlayer.CreateMemory(configuration, 2);
memory.WriteUInt32(EstimatePiIteractionsCountUInt32Index, iterationsCount);
memory.WriteUInt32(EstimatePiRandomSeedUInt32Index, (uint)_random.Get());
EstimatePi(memory);
// This single calculation takes up too much space on the FPGA, since it needs fix point arithmetic, but it
// doesn't take much time. So doing it on the host instead.
return (double)memory.ReadInt32(EstimatePiInCircleCountSumUInt32Index) / iterationsCount * 4;
}
}