From 3916ca243e86c1cb28e75e6d6178d8c8ee49d519 Mon Sep 17 00:00:00 2001 From: bendudson Date: Wed, 12 May 2021 17:01:36 +0000 Subject: [PATCH 01/13] [skip ci] Apply black changes --- tests/integrated/test-cyclic/runtest | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/integrated/test-cyclic/runtest b/tests/integrated/test-cyclic/runtest index 051744a61b..9ebdf9f552 100755 --- a/tests/integrated/test-cyclic/runtest +++ b/tests/integrated/test-cyclic/runtest @@ -20,8 +20,14 @@ from sys import stdout, exit build_and_log("Cyclic Reduction test") -flags = ["", "nsys=2", "nsys=5 periodic", "nsys=7 n=10", - "ngather=1", "nsys=7 n=10 ngather=1"] +flags = [ + "", + "nsys=2", + "nsys=5 periodic", + "nsys=7 n=10", + "ngather=1", + "nsys=7 n=10 ngather=1", +] code = 0 # Return code for nproc in [1, 2, 4]: From ac7f3cd6a85ba8d5c37b48f277d0247e8bf4e82f Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Wed, 12 May 2021 18:11:04 +0100 Subject: [PATCH 02/13] Add an option to CyclicReduce to gather onto fewer processors By default all processors are used, resulting in an all-to-all communication pattern. An optional third argument to the constructor now specifies how many processors should be used. This reduces the number of messages, at the cost of increasing load imbalance. The processors used are evenly distributed, in the hope that this will minimise network traffic on any given node. --- include/cyclic_reduction.hxx | 232 ++++++++++++------- tests/integrated/test-cyclic/runtest | 3 +- tests/integrated/test-cyclic/test_cyclic.cxx | 11 +- 3 files changed, 159 insertions(+), 87 deletions(-) diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx index 9958937e3d..00851b52da 100644 --- a/include/cyclic_reduction.hxx +++ b/include/cyclic_reduction.hxx @@ -13,7 +13,7 @@ alpha = a3 / b1 * ************************************************************************** - * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu + * Copyright 2010,2021 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact: Ben Dudson, bd512@york.ac.uk * @@ -58,26 +58,39 @@ template class CyclicReduce { public: CyclicReduce() = default; - CyclicReduce(MPI_Comm c, int size) : comm(c), N(size) { - MPI_Comm_size(c, &nprocs); - MPI_Comm_rank(c, &myproc); + CyclicReduce(MPI_Comm c, int size, int ngather=0) : comm(c), ngatherprocs(ngather), N(size) { + setup(c, size, ngather); } /// Set parameters + /// This can be called after creation, to reset the solver + /// /// @param[in] c The communicator of all processors involved in the solve /// @param[in] size The number of rows on this processor - void setup(MPI_Comm c, int size) { + /// @param[in] gather The number of processors to gather onto. If 0, use all processors + void setup(MPI_Comm c, int size, int ngather=0) { comm = c; int np, myp; MPI_Comm_size(c, &np); MPI_Comm_rank(c, &myp); - if ((size != N) || (np != nprocs) || (myp != myproc)) + + if (ngather == 0) { + ngather = np; // Use all processors + } + + if ((size != N) || (np != nprocs) || (myp != myproc) || (ngather != ngatherprocs)) { Nsys = 0; // Need to re-size + } + N = size; periodic = false; nprocs = np; myproc = myp; + + ngatherprocs = ngather; + ASSERT1(ngatherprocs > 0); + ASSERT1(ngatherprocs <= nprocs); } ~CyclicReduce() = default; @@ -117,7 +130,7 @@ public: int nsys = std::get<0>(a.shape()); // Make sure correct memory arrays allocated - allocMemory(nprocs, nsys, N); + allocMemory(nsys); // Fill coefficient array BOUT_OMP(parallel for) @@ -190,13 +203,14 @@ public: /////////////////////////////////////// // Gather all interface equations onto single processor - // NOTE: Need to replace with divide-and-conquer at some point + // NOTE: Cyclic reduction would do this in multiple stages, + // but here we just gather onto ngatherprocs processors // - // There are Nsys sets of equations to gather, and nprocs processors + // There are Nsys sets of equations to gather, and ngatherprocs processors // which can be used. Each processor therefore sends interface equations // to several different processors for gathering // - // e.g. 3 processors (nproc=3), 4 sets of equations (Nsys=4): + // e.g. 3 processors (nprocs=3 and ngatherprocs=3), 4 sets of equations (Nsys=4): // // PE 0: [1a 2a 3a 4a] PE 1: [1b 2b 3b 4b] PE 2: [1c 2c 3c 4c] // @@ -207,14 +221,19 @@ public: // // Here PE 0 would have myns=2, PE 1 and 2 would have myns=1 - int ns = Nsys / nprocs; // Number of systems to assign to all processors - int nsextra = Nsys % nprocs; // Number of processors with 1 extra + int ns = Nsys / ngatherprocs; // Number of systems to assign to all gathering processors + int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra - auto* req = new MPI_Request[nprocs]; + // Receive requests if (myns > 0) { - // Post receives from all other processors - req[myproc] = MPI_REQUEST_NULL; + // If gathering systems onto this processor + // post receives from all other processors + + all_req.resize(nprocs); // One communicator per processor + all_buffer.reallocate(nprocs, myns * 8); // Storage for systems from each processor + + all_req[myproc] = MPI_REQUEST_NULL; for (int p = 0; p < nprocs; p++) { // Loop over processor // 2 interface equations per processor // myns systems to solve @@ -231,27 +250,48 @@ public: #ifdef DIAGNOSE output << "Expecting to receive " << len << " from " << p << endl; #endif - MPI_Irecv(&recvbuffer(p, 0), len, + MPI_Irecv(&all_buffer(p, 0), len, MPI_BYTE, // Just sending raw data, unknown type p, // Destination processor p, // Identifier comm, // Communicator - &req[p]); // Request + &all_req[p]); // Request } } } - // Send data - int s0 = 0; - for (int p = 0; p < nprocs; p++) { // Loop over processor - int nsp = ns; - if (p < nsextra) - nsp++; + // Send data to the processors which are gathering + + gather_req.resize(ngatherprocs); // One request per gathering processor + gather_proc.resize(ngatherprocs); // Processor number + gather_sys_offset.resize(ngatherprocs); // Index of the first system gathered + gather_nsys.resize(ngatherprocs); // Number of systems + + // Interval between gathering processors + BoutReal pinterval = static_cast(nprocs) / ngatherprocs; + + int s0 = 0; // System number + + // Loop over gathering processors + for (int i = 0; i < ngatherprocs; i++) { + + int p = i; // Gathering onto all processors + if (ngatherprocs != nprocs) { + // Gathering onto only some + p = static_cast( pinterval * i ); + } + + int nsp = ns; // Number of systems to send to this processor + if (i < nsextra) + nsp++; // Some processors get an extra system + + gather_proc[i] = p; + gather_sys_offset[i] = s0; + gather_nsys[i] = nsp; + if ((p != myproc) && (nsp > 0)) { #ifdef DIAGNOSE output << "Sending to " << p << endl; - for (int i = 0; i < 8; i++) - output << "value " << i << " : " << myif(s0, i) << endl; #endif MPI_Send(&myif(s0, 0), // Data pointer 8 * nsp * sizeof(T), // Number @@ -260,15 +300,18 @@ public: myproc, // Message identifier comm); // Communicator } + s0 += nsp; } if (myns > 0) { + // If this processor is gathering systems + // Wait for data int p; do { MPI_Status stat; - MPI_Waitany(nprocs, req, &p, &stat); + MPI_Waitany(nprocs, all_req.data(), &p, &stat); if (p != MPI_UNDEFINED) { // p is the processor number. Copy data #ifdef DIAGNOSE @@ -278,11 +321,11 @@ public: for (int i = 0; i < myns; i++) for (int j = 0; j < 8; j++) { #ifdef DIAGNOSE - output << "Value " << j << " : " << recvbuffer(p, 8 * i + j) << endl; + output << "Value " << j << " : " << all_buffer(p, 8 * i + j) << endl; #endif - ifcs(i, 8 * p + j) = recvbuffer(p, 8 * i + j); + ifcs(i, 8 * p + j) = all_buffer(p, 8 * i + j); } - req[p] = MPI_REQUEST_NULL; + all_req[p] = MPI_REQUEST_NULL; } } while (p != MPI_UNDEFINED); @@ -342,11 +385,15 @@ public: /////////////////////////////////////// // Scatter back solution + // Allocate buffer space. Note assumes 0th processor has most systems + gather_buffer.reallocate(ngatherprocs, 2 * gather_nsys[0]); + // Post receives - for (int p = 0; p < nprocs; p++) { // Loop over processor - int nsp = ns; - if (p < nsextra) - nsp++; + // Loop over gathering processors + for (int i = 0; i < ngatherprocs; i++) { + int p = gather_proc[i]; // Processor number + int nsp = gather_nsys[i]; // Number of systems + int len = 2 * nsp * sizeof(T); // 2 values per system if (p == myproc) { @@ -356,19 +403,20 @@ public: x1[sys0 + i] = ifx(i, 2 * p); xn[sys0 + i] = ifx(i, 2 * p + 1); } - req[p] = MPI_REQUEST_NULL; + gather_req[i] = MPI_REQUEST_NULL; } else if (nsp > 0) { #ifdef DIAGNOSE output << "Expecting receive from " << p << " of size " << len << endl; #endif - MPI_Irecv(&recvbuffer(p, 0), len, + MPI_Irecv(&gather_buffer(i, 0), len, MPI_BYTE, // Just sending raw data, unknown type p, // Destination processor p, // Identifier comm, // Communicator - &req[p]); // Request - } else - req[p] = MPI_REQUEST_NULL; + &gather_req[p]); // Request + } else { + gather_req[i] = MPI_REQUEST_NULL; + } } if (myns > 0) { @@ -392,49 +440,51 @@ public: } // Wait for data - int fromproc; - int nsp; + int fromind; do { MPI_Status stat; - MPI_Waitany(nprocs, req, &fromproc, &stat); + MPI_Waitany(ngatherprocs, gather_req.data(), &fromind, &stat); - if (fromproc != MPI_UNDEFINED) { - // fromproc is the processor number. Copy data + if (fromind != MPI_UNDEFINED) { + // Copy data - int s0 = fromproc * ns; - if (fromproc > nsextra) { - s0 += nsextra; - } else - s0 += fromproc; + int fromproc = gather_proc[fromind]; // From processor + int s0 = gather_sys_offset[fromind]; // System index start + int nsp = gather_nsys[fromind]; // Number of systems - nsp = ns; - if (fromproc < nsextra) - nsp++; - BOUT_OMP(parallel for) for (int i = 0; i < nsp; i++) { - x1[s0 + i] = recvbuffer(fromproc, 2 * i); - xn[s0 + i] = recvbuffer(fromproc, 2 * i + 1); + x1[s0 + i] = gather_buffer(fromind, 2 * i); + xn[s0 + i] = gather_buffer(fromind, 2 * i + 1); #ifdef DIAGNOSE output << "Received x1,xn[" << s0 + i << "] = " << x1[s0 + i] << ", " << xn[s0 + i] << " from " << fromproc << endl; #endif } - req[fromproc] = MPI_REQUEST_NULL; + gather_req[fromproc] = MPI_REQUEST_NULL; } - } while (fromproc != MPI_UNDEFINED); + } while (fromind != MPI_UNDEFINED); } /////////////////////////////////////// // Solve local equations back_solve(Nsys, N, coefs, x1, xn, x); - delete[] req; } private: MPI_Comm comm; ///< Communicator int nprocs{0}, myproc{-1}; ///< Number of processors and ID of my processor + int ngatherprocs{0}; ///< Number of processors to gather onto + std::vector gather_req; ///< Comms with gathering processors + std::vector gather_proc; ///< The processor number + std::vector gather_sys_offset; ///< Index of the first system gathered + std::vector gather_nsys; ///< Number of systems gathered + Matrix gather_buffer; ///< Buffer for receiving from gathering processors + + std::vector all_req; ///< Requests for comms with all processors + Matrix all_buffer; ///< Buffer for receiving from all other processors + int N{0}; ///< Total size of the problem int Nsys{0}; ///< Number of independent systems to solve int myns; ///< Number of systems for interface solve on this processor @@ -445,7 +495,6 @@ private: Matrix coefs; ///< Starting coefficients, rhs [Nsys, {3*coef,rhs}*N] Matrix myif; ///< Interface equations for this processor - Matrix recvbuffer; ///< Buffer for receiving from other processors Matrix ifcs; ///< Coefficients for interface solve Matrix if2x2; ///< 2x2 interface equations on this processor Matrix ifx; ///< Solution of interface equations @@ -453,42 +502,61 @@ private: Array x1, xn; ///< Interface solutions for back-solving /// Allocate memory arrays - /// @param[in] np Number of processors /// @param[in] nsys Number of independent systems to solve - /// @param[in] n Size of each system of equations - void allocMemory(int np, int nsys, int n) { - if ((nsys == Nsys) && (n == N) && (np == nprocs)) + void allocMemory(int nsys) { + if (nsys == Nsys) return; // No need to allocate memory - nprocs = np; Nsys = nsys; - N = n; // Work out how many systems are going to be solved on this processor - int ns = nsys / nprocs; // Number of systems to assign to all processors - int nsextra = nsys % nprocs; // Number of processors with 1 extra - - myns = ns; // Number of systems to gather onto this processor - sys0 = ns * myproc; // Starting system number - if (myproc < nsextra) { - myns++; - sys0 += myproc; + int ns = nsys / ngatherprocs; // Number of systems to assign to all processors + int nsextra = nsys % ngatherprocs; // Number of processors with 1 extra + + if (ngatherprocs == nprocs) { + // Gathering onto all processors (This is the default) + + myns = ns; // Number of systems to gather onto this processor + sys0 = ns * myproc; // Starting system number + if (myproc < nsextra) { + myns++; + sys0 += myproc; + } else { + sys0 += nsextra; + } } else { - sys0 += nsextra; + // Gathering onto only a few processors + + myns = 0; // Default to not gathering onto this processor + + // Interval between processors + BoutReal pinterval = static_cast(nprocs) / ngatherprocs; + ASSERT1(pinterval > 1.0); + + // Calculate which processors these are + for (int i = 0; i < ngatherprocs; i++) { + int proc = static_cast( pinterval * i ); + + if (proc == myproc) { + // This processor is receiving + + myns = ns; // Number of systems to gather onto this processor + sys0 = ns * i; // Starting system number + + if (i < nsextra) { + // Receiving an extra system + myns++; + sys0 += i; + } else { + sys0 += nsextra; + } + } + } } coefs.reallocate(Nsys, 4 * N); myif.reallocate(Nsys, 8); - // Note: The recvbuffer is used to receive data in both stages of the solve: - // 1. In the gather step, this processor will receive myns interface equations - // from each processor. - // 2. In the scatter step, this processor receives the solved interface values - // from each processor. The number of systems of equations received will - // vary from myns to myns+1 (if myproc >= nsextra). - // The size of the array reserved is therefore (myns+1) - recvbuffer.reallocate(nprocs, (myns + 1) * 8); - // Some interface systems to be solved on this processor // Note that the interface equations are organised by system (myns as first argument) // but communication buffers are organised by processor (nprocs first). diff --git a/tests/integrated/test-cyclic/runtest b/tests/integrated/test-cyclic/runtest index ae2bca24c7..051744a61b 100755 --- a/tests/integrated/test-cyclic/runtest +++ b/tests/integrated/test-cyclic/runtest @@ -20,7 +20,8 @@ from sys import stdout, exit build_and_log("Cyclic Reduction test") -flags = ["", "nsys=2", "nsys=5 periodic", "nsys=7 n=10"] +flags = ["", "nsys=2", "nsys=5 periodic", "nsys=7 n=10", + "ngather=1", "nsys=7 n=10 ngather=1"] code = 0 # Return code for nproc in [1, 2, 4]: diff --git a/tests/integrated/test-cyclic/test_cyclic.cxx b/tests/integrated/test-cyclic/test_cyclic.cxx index f90147d900..366ba2a6b4 100644 --- a/tests/integrated/test-cyclic/test_cyclic.cxx +++ b/tests/integrated/test-cyclic/test_cyclic.cxx @@ -21,7 +21,7 @@ int main(int argc, char **argv) { int nsys; int n; - Options *options = Options::getRoot(); + Options& options = Options::root(); OPTION(options, n, 5); OPTION(options, nsys, 1); BoutReal tol; @@ -29,13 +29,16 @@ int main(int argc, char **argv) { bool periodic; OPTION(options, periodic, false); - // Create a cyclic reduction object, operating on Ts - auto* cr = new CyclicReduce(BoutComm::get(), n); - int mype, npe; MPI_Comm_rank(BoutComm::get(), &mype); MPI_Comm_size(BoutComm::get(), &npe); + int ngather = + options["ngather"].doc("The number of processors to gather onto").withDefault(npe); + + // Create a cyclic reduction object, operating on Ts + auto* cr = new CyclicReduce(BoutComm::get(), n, ngather); + a.reallocate(nsys, n); b.reallocate(nsys, n); c.reallocate(nsys, n); From 6ff389629d1f509e55cdfabdc6555df8d5b0a0c7 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 13 May 2021 10:09:14 +0100 Subject: [PATCH 03/13] Add ngather option to Laplace cyclic solver Input option to control the number of processors (in X) to gather systems onto. This can be used to tune performance: Smaller ngather leads to fewer messages, but more load imbalance. Modified test, to cover more variations of nxpe and ngather. That test found some bugs, which are fixed here. --- include/cyclic_reduction.hxx | 66 +++++++++---------- .../laplace/impls/cyclic/cyclic_laplace.cxx | 12 +++- tests/integrated/test-laplace/runtest | 11 ++-- 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx index 00851b52da..898abf9f58 100644 --- a/include/cyclic_reduction.hxx +++ b/include/cyclic_reduction.hxx @@ -221,9 +221,6 @@ public: // // Here PE 0 would have myns=2, PE 1 and 2 would have myns=1 - int ns = Nsys / ngatherprocs; // Number of systems to assign to all gathering processors - int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra - // Receive requests if (myns > 0) { @@ -270,38 +267,42 @@ public: // Interval between gathering processors BoutReal pinterval = static_cast(nprocs) / ngatherprocs; - int s0 = 0; // System number + { + int ns = Nsys / ngatherprocs; // Number of systems to assign to all gathering processors + int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra + int s0 = 0; // Starting system number - // Loop over gathering processors - for (int i = 0; i < ngatherprocs; i++) { + // Loop over gathering processors + for (int i = 0; i < ngatherprocs; i++) { - int p = i; // Gathering onto all processors - if (ngatherprocs != nprocs) { - // Gathering onto only some - p = static_cast( pinterval * i ); - } + int p = i; // Gathering onto all processors + if (ngatherprocs != nprocs) { + // Gathering onto only some + p = static_cast( pinterval * i ); + } - int nsp = ns; // Number of systems to send to this processor - if (i < nsextra) - nsp++; // Some processors get an extra system + int nsp = ns; // Number of systems to send to this processor + if (i < nsextra) + nsp++; // Some processors get an extra system - gather_proc[i] = p; - gather_sys_offset[i] = s0; - gather_nsys[i] = nsp; + gather_proc[i] = p; + gather_sys_offset[i] = s0; + gather_nsys[i] = nsp; - if ((p != myproc) && (nsp > 0)) { + if ((p != myproc) && (nsp > 0)) { #ifdef DIAGNOSE - output << "Sending to " << p << endl; + output << "Sending to " << p << endl; #endif - MPI_Send(&myif(s0, 0), // Data pointer - 8 * nsp * sizeof(T), // Number - MPI_BYTE, // Type - p, // Destination - myproc, // Message identifier - comm); // Communicator - } + MPI_Send(&myif(s0, 0), // Data pointer + 8 * nsp * sizeof(T), // Number + MPI_BYTE, // Type + p, // Destination + myproc, // Message identifier + comm); // Communicator + } - s0 += nsp; + s0 += nsp; + } } if (myns > 0) { @@ -409,11 +410,11 @@ public: output << "Expecting receive from " << p << " of size " << len << endl; #endif MPI_Irecv(&gather_buffer(i, 0), len, - MPI_BYTE, // Just sending raw data, unknown type - p, // Destination processor + MPI_BYTE, // Just receiving raw data, unknown type + p, // Origin processor p, // Identifier comm, // Communicator - &gather_req[p]); // Request + &gather_req[i]); // Request } else { gather_req[i] = MPI_REQUEST_NULL; } @@ -448,7 +449,6 @@ public: if (fromind != MPI_UNDEFINED) { // Copy data - int fromproc = gather_proc[fromind]; // From processor int s0 = gather_sys_offset[fromind]; // System index start int nsp = gather_nsys[fromind]; // Number of systems @@ -458,10 +458,10 @@ public: xn[s0 + i] = gather_buffer(fromind, 2 * i + 1); #ifdef DIAGNOSE output << "Received x1,xn[" << s0 + i << "] = " << x1[s0 + i] << ", " - << xn[s0 + i] << " from " << fromproc << endl; + << xn[s0 + i] << " from " << gather_proc[fromind] << endl; #endif } - gather_req[fromproc] = MPI_REQUEST_NULL; + gather_req[fromind] = MPI_REQUEST_NULL; } } while (fromind != MPI_UNDEFINED); } diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx index af203d2ec5..4c9ac49618 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx @@ -55,10 +55,11 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in) OPTION(opt, dst, false); - if(dst) { + if (dst) { nmode = localmesh->LocalNz-2; - }else + } else { nmode = maxmode+1; // Number of Z modes. maxmode set in invert_laplace.cxx from options + } // Note nmode == nsys of cyclic_reduction @@ -81,8 +82,13 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in) xcmplx.reallocate(nmode, n); bcmplx.reallocate(nmode, n); + int ngather = + (*opt)["ngather"] + .doc("Number of processors in X to gather onto. Default (0) is all processors") + .withDefault(0); + // Create a cyclic reduction object, operating on dcomplex values - cr = new CyclicReduce(localmesh->getXcomm(), n); + cr = new CyclicReduce(localmesh->getXcomm(), n, ngather); cr->setPeriodic(localmesh->periodicX); } diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest index d4a501f184..446e808037 100755 --- a/tests/integrated/test-laplace/runtest +++ b/tests/integrated/test-laplace/runtest @@ -52,18 +52,15 @@ for v in vars: print("Running Laplacian inversion test") success = True -for nproc in [1, 2, 4]: - nxpe = 1 - if nproc > 2: - nxpe = 2 +for nproc, nxpe, ngather in [(1,1,1), (2,1,1), (2,2,2), (4,2,1), (4,4,2)]: - cmd = "./test_laplace nxpe=" + str(nxpe) + cmd = "./test_laplace nxpe=" + str(nxpe)+ " laplace:ngather=" + str(ngather) shell("rm data/BOUT.dmp.*.nc") - print(" %d processors (nxpe = %d)...." % (nproc, nxpe)) + print(" {} processors (nxpe = {}, ngather = {})....".format(nproc, nxpe, ngather)) s, out = launch_safe(cmd, nproc=nproc, mthread=1, pipe=True) - with open("run.log." + str(nproc), "w") as f: + with open("run.log." + str(nproc) + "." + str(nxpe) + "." + str(ngather), "w") as f: f.write(out) # Collect output data From 764725e2b0e6c34ec1cf6d6bda09c16ff6066d4d Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 13 May 2021 10:18:06 +0100 Subject: [PATCH 04/13] Clang and Black formatting --- include/cyclic_reduction.hxx | 103 ++++++++-------- .../laplace/impls/cyclic/cyclic_laplace.cxx | 115 +++++++++--------- tests/integrated/test-laplace/runtest | 4 +- 3 files changed, 114 insertions(+), 108 deletions(-) diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx index 898abf9f58..26aa0f85f3 100644 --- a/include/cyclic_reduction.hxx +++ b/include/cyclic_reduction.hxx @@ -43,22 +43,24 @@ //#define DIAGNOSE 1 #include "mpi.h" -#include "utils.hxx" #include "msg_stack.hxx" +#include "utils.hxx" #include -#include "bout/assert.hxx" #include "boutexception.hxx" +#include "bout/assert.hxx" #include "output.hxx" #include "bout/openmpwrap.hxx" -template class CyclicReduce { +template +class CyclicReduce { public: CyclicReduce() = default; - CyclicReduce(MPI_Comm c, int size, int ngather=0) : comm(c), ngatherprocs(ngather), N(size) { + CyclicReduce(MPI_Comm c, int size, int ngather = 0) + : comm(c), ngatherprocs(ngather), N(size) { setup(c, size, ngather); } @@ -68,7 +70,7 @@ public: /// @param[in] c The communicator of all processors involved in the solve /// @param[in] size The number of rows on this processor /// @param[in] gather The number of processors to gather onto. If 0, use all processors - void setup(MPI_Comm c, int size, int ngather=0) { + void setup(MPI_Comm c, int size, int ngather = 0) { comm = c; int np, myp; @@ -99,7 +101,7 @@ public: /// By default not periodic void setPeriodic(bool p = true) { periodic = p; } - void setCoefs(const Array &a, const Array &b, const Array &c) { + void setCoefs(const Array& a, const Array& b, const Array& c) { ASSERT2(a.size() == b.size()); ASSERT2(a.size() == c.size()); ASSERT2(a.size() == N); @@ -108,7 +110,7 @@ public: Matrix bMatrix(1, N); Matrix cMatrix(1, N); - BOUT_OMP(parallel for) + BOUT_OMP(parallel for) for (int i = 0; i < N; ++i) { aMatrix(0, i) = a[i]; bMatrix(0, i) = b[i]; @@ -148,7 +150,7 @@ public: /// /// @param[in] rhs Array storing Values of the rhs for a single system /// @param[out] x Array storing the result for a single system - void solve(const Array &rhs, Array &x) { + void solve(const Array& rhs, Array& x) { ASSERT2(rhs.size() == x.size()); ASSERT2(rhs.size() == N); @@ -175,7 +177,7 @@ public: /// /// @param[in] rhs Matrix storing Values of the rhs for each system /// @param[out] x Matrix storing the result for each system - void solve(const Matrix &rhs, Matrix &x) { + void solve(const Matrix& rhs, Matrix& x) { TRACE("CyclicReduce::solve"); ASSERT2(static_cast(std::get<0>(rhs.shape())) == Nsys); ASSERT2(static_cast(std::get<0>(x.shape())) == Nsys); @@ -227,7 +229,7 @@ public: // If gathering systems onto this processor // post receives from all other processors - all_req.resize(nprocs); // One communicator per processor + all_req.resize(nprocs); // One communicator per processor all_buffer.reallocate(nprocs, myns * 8); // Storage for systems from each processor all_req[myproc] = MPI_REQUEST_NULL; @@ -239,7 +241,7 @@ public: if (p == myproc) { // Just copy the data - BOUT_OMP(parallel for) + BOUT_OMP(parallel for) for (int i = 0; i < myns; i++) for (int j = 0; j < 8; j++) ifcs(i, 8 * p + j) = myif(sys0 + i, j); @@ -248,10 +250,10 @@ public: output << "Expecting to receive " << len << " from " << p << endl; #endif MPI_Irecv(&all_buffer(p, 0), len, - MPI_BYTE, // Just sending raw data, unknown type - p, // Destination processor - p, // Identifier - comm, // Communicator + MPI_BYTE, // Just sending raw data, unknown type + p, // Destination processor + p, // Identifier + comm, // Communicator &all_req[p]); // Request } } @@ -259,18 +261,19 @@ public: // Send data to the processors which are gathering - gather_req.resize(ngatherprocs); // One request per gathering processor - gather_proc.resize(ngatherprocs); // Processor number - gather_sys_offset.resize(ngatherprocs); // Index of the first system gathered - gather_nsys.resize(ngatherprocs); // Number of systems + gather_req.resize(ngatherprocs); // One request per gathering processor + gather_proc.resize(ngatherprocs); // Processor number + gather_sys_offset.resize(ngatherprocs); // Index of the first system gathered + gather_nsys.resize(ngatherprocs); // Number of systems // Interval between gathering processors BoutReal pinterval = static_cast(nprocs) / ngatherprocs; { - int ns = Nsys / ngatherprocs; // Number of systems to assign to all gathering processors + int ns = + Nsys / ngatherprocs; // Number of systems to assign to all gathering processors int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra - int s0 = 0; // Starting system number + int s0 = 0; // Starting system number // Loop over gathering processors for (int i = 0; i < ngatherprocs; i++) { @@ -278,7 +281,7 @@ public: int p = i; // Gathering onto all processors if (ngatherprocs != nprocs) { // Gathering onto only some - p = static_cast( pinterval * i ); + p = static_cast(pinterval * i); } int nsp = ns; // Number of systems to send to this processor @@ -318,7 +321,7 @@ public: #ifdef DIAGNOSE output << "Copying received data from " << p << endl; #endif - BOUT_OMP(parallel for) + BOUT_OMP(parallel for) for (int i = 0; i < myns; i++) for (int j = 0; j < 8; j++) { #ifdef DIAGNOSE @@ -348,7 +351,7 @@ public: if2x2.ensureUnique(); x1.ensureUnique(); xn.ensureUnique(); - + BOUT_OMP(parallel for) for (int i = 0; i < myns; ++i) { // (a b) (x1) = (b1) @@ -392,14 +395,14 @@ public: // Post receives // Loop over gathering processors for (int i = 0; i < ngatherprocs; i++) { - int p = gather_proc[i]; // Processor number + int p = gather_proc[i]; // Processor number int nsp = gather_nsys[i]; // Number of systems int len = 2 * nsp * sizeof(T); // 2 values per system if (p == myproc) { // Just copy the data - BOUT_OMP(parallel for) + BOUT_OMP(parallel for) for (int i = 0; i < myns; i++) { x1[sys0 + i] = ifx(i, 2 * p); xn[sys0 + i] = ifx(i, 2 * p + 1); @@ -410,10 +413,10 @@ public: output << "Expecting receive from " << p << " of size " << len << endl; #endif MPI_Irecv(&gather_buffer(i, 0), len, - MPI_BYTE, // Just receiving raw data, unknown type - p, // Origin processor - p, // Identifier - comm, // Communicator + MPI_BYTE, // Just receiving raw data, unknown type + p, // Origin processor + p, // Identifier + comm, // Communicator &gather_req[i]); // Request } else { gather_req[i] = MPI_REQUEST_NULL; @@ -424,7 +427,7 @@ public: // Send data for (int p = 0; p < nprocs; p++) { // Loop over processor if (p != myproc) { - BOUT_OMP(parallel for) + BOUT_OMP(parallel for) for (int i = 0; i < myns; i++) { ifp[2 * i] = ifx(i, 2 * p); ifp[2 * i + 1] = ifx(i, 2 * p + 1); @@ -452,7 +455,7 @@ public: int s0 = gather_sys_offset[fromind]; // System index start int nsp = gather_nsys[fromind]; // Number of systems - BOUT_OMP(parallel for) + BOUT_OMP(parallel for) for (int i = 0; i < nsp; i++) { x1[s0 + i] = gather_buffer(fromind, 2 * i); xn[s0 + i] = gather_buffer(fromind, 2 * i + 1); @@ -475,15 +478,15 @@ private: MPI_Comm comm; ///< Communicator int nprocs{0}, myproc{-1}; ///< Number of processors and ID of my processor - int ngatherprocs{0}; ///< Number of processors to gather onto + int ngatherprocs{0}; ///< Number of processors to gather onto std::vector gather_req; ///< Comms with gathering processors - std::vector gather_proc; ///< The processor number - std::vector gather_sys_offset; ///< Index of the first system gathered - std::vector gather_nsys; ///< Number of systems gathered - Matrix gather_buffer; ///< Buffer for receiving from gathering processors + std::vector gather_proc; ///< The processor number + std::vector gather_sys_offset; ///< Index of the first system gathered + std::vector gather_nsys; ///< Number of systems gathered + Matrix gather_buffer; ///< Buffer for receiving from gathering processors std::vector all_req; ///< Requests for comms with all processors - Matrix all_buffer; ///< Buffer for receiving from all other processors + Matrix all_buffer; ///< Buffer for receiving from all other processors int N{0}; ///< Total size of the problem int Nsys{0}; ///< Number of independent systems to solve @@ -495,11 +498,11 @@ private: Matrix coefs; ///< Starting coefficients, rhs [Nsys, {3*coef,rhs}*N] Matrix myif; ///< Interface equations for this processor - Matrix ifcs; ///< Coefficients for interface solve - Matrix if2x2; ///< 2x2 interface equations on this processor - Matrix ifx; ///< Solution of interface equations - Array ifp; ///< Interface equations returned to processor p - Array x1, xn; ///< Interface solutions for back-solving + Matrix ifcs; ///< Coefficients for interface solve + Matrix if2x2; ///< 2x2 interface equations on this processor + Matrix ifx; ///< Solution of interface equations + Array ifp; ///< Interface equations returned to processor p + Array x1, xn; ///< Interface solutions for back-solving /// Allocate memory arrays /// @param[in] nsys Number of independent systems to solve @@ -535,7 +538,7 @@ private: // Calculate which processors these are for (int i = 0; i < ngatherprocs; i++) { - int proc = static_cast( pinterval * i ); + int proc = static_cast(pinterval * i); if (proc == myproc) { // This processor is receiving @@ -584,12 +587,12 @@ private: /// ( a3 b3 c3 ) => ( A2 B2 C2) /// ( ... ) /// ( an bn cn) - void reduce(int ns, int nloc, Matrix &co, Matrix &ifc) { + void reduce(int ns, int nloc, Matrix& co, Matrix& ifc) { #ifdef DIAGNOSE if (nloc < 2) throw BoutException("CyclicReduce::reduce nloc < 2"); #endif - + BOUT_OMP(parallel for) for (int j = 0; j < ns; j++) { // Calculate upper interface equation @@ -662,20 +665,20 @@ private: const Array& xn, Matrix& xa) { xa.ensureUnique(); // Going to be modified, so call this outside parallel region - + // Tridiagonal system, solve using serial Thomas algorithm // xa -- Result for each system // co -- Coefficients & rhs for each system BOUT_OMP(parallel for) for (int i = 0; i < ns; i++) { // Loop over systems - Array gam(nloc); // Thread-local array + Array gam(nloc); // Thread-local array T bet = 1.0; xa(i, 0) = x1[i]; // Already know the first gam[1] = 0.; for (int j = 1; j < nloc - 1; j++) { bet = co(i, 4 * j + 1) - co(i, 4 * j) * gam[j]; // bet = b[j]-a[j]*gam[j] - xa(i, j) = (co(i, 4 * j + 3) - co(i, 4 * j) * xa(i, j - 1)) / - bet; // x[j] = (r[j]-a[j]*x[j-1])/bet; + xa(i, j) = (co(i, 4 * j + 3) - co(i, 4 * j) * xa(i, j - 1)) + / bet; // x[j] = (r[j]-a[j]*x[j-1])/bet; gam[j + 1] = co(i, 4 * j + 2) / bet; // gam[j+1] = c[j]/bet } xa(i, nloc - 1) = xn[i]; // Know the last value diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx index 4c9ac49618..9ab6e5f942 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx @@ -33,18 +33,18 @@ * */ -#include -#include +#include #include -#include -#include #include -#include +#include +#include +#include #include +#include #include "cyclic_laplace.hxx" -LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in) +LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in) : Laplacian(opt, loc, mesh_in), Acoef(0.0), C1coef(1.0), C2coef(1.0), Dcoef(1.0) { Acoef.setLocation(location); C1coef.setLocation(location); @@ -56,9 +56,10 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in) OPTION(opt, dst, false); if (dst) { - nmode = localmesh->LocalNz-2; + nmode = localmesh->LocalNz - 2; } else { - nmode = maxmode+1; // Number of Z modes. maxmode set in invert_laplace.cxx from options + nmode = + maxmode + 1; // Number of Z modes. maxmode set in invert_laplace.cxx from options } // Note nmode == nsys of cyclic_reduction @@ -66,15 +67,19 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in) // Allocate arrays xs = localmesh->xstart; // Starting X index - if(localmesh->firstX() && !localmesh->periodicX){ // Only want to include guard cells at boundaries (unless periodic in x) - xs = 0; + if (localmesh->firstX() + && !localmesh->periodicX) { // Only want to include guard cells at boundaries + // (unless periodic in x) + xs = 0; } - xe = localmesh->xend; // Last X index - if(localmesh->lastX() && !localmesh->periodicX){ // Only want to include guard cells at boundaries (unless periodic in x) - xe = localmesh->LocalNx-1; + xe = localmesh->xend; // Last X index + if (localmesh->lastX() + && !localmesh->periodicX) { // Only want to include guard cells at boundaries + // (unless periodic in x) + xe = localmesh->LocalNx - 1; } - int n = xe - xs + 1; // Number of X points on this processor, - // including boundaries but not guard cells + int n = xe - xs + 1; // Number of X points on this processor, + // including boundaries but not guard cells a.reallocate(nmode, n); b.reallocate(nmode, n); @@ -104,26 +109,26 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { FieldPerp x{emptyFrom(rhs)}; // Result - int jy = rhs.getIndex(); // Get the Y index + int jy = rhs.getIndex(); // Get the Y index x.setIndex(jy); // Get the width of the boundary // If the flags to assign that only one guard cell should be used is set - int inbndry = localmesh->xstart, outbndry=localmesh->xstart; - if((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) { + int inbndry = localmesh->xstart, outbndry = localmesh->xstart; + if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) { inbndry = outbndry = 1; } - if(inner_boundary_flags & INVERT_BNDRY_ONE) + if (inner_boundary_flags & INVERT_BNDRY_ONE) inbndry = 1; - if(outer_boundary_flags & INVERT_BNDRY_ONE) + if (outer_boundary_flags & INVERT_BNDRY_ONE) outbndry = 1; - if(dst) { + if (dst) { BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = - Array(localmesh->LocalNz); // ZFFT routine expects input of this length + auto k1d = Array( + localmesh->LocalNz); // ZFFT routine expects input of this length // Loop over X indices, including boundaries but not guard cells. (unless periodic // in x) @@ -131,9 +136,9 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { for (int ix = xs; ix <= xe; ix++) { // Take DST in Z direction and put result in k1d - if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) || - ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) && - localmesh->lastX())) { + if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) + || ((localmesh->LocalNx - ix - 1 < outbndry) + && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) { // Use the values in x0 in the boundary DST(x0[ix] + 1, localmesh->LocalNz - 2, std::begin(k1d)); } else { @@ -169,8 +174,8 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { // FFT back to real space BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = - Array(localmesh->LocalNz); // ZFFT routine expects input of this length + auto k1d = Array( + localmesh->LocalNz); // ZFFT routine expects input of this length BOUT_OMP(for nowait) for (int ix = xs; ix <= xe; ix++) { @@ -186,12 +191,11 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { x(ix, localmesh->LocalNz - 1) = -x(ix, localmesh->LocalNz - 3); } } - }else { - BOUT_OMP(parallel) - { + } else { + BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = Array((localmesh->LocalNz) / 2 + - 1); // ZFFT routine expects input of this length + auto k1d = Array((localmesh->LocalNz) / 2 + + 1); // ZFFT routine expects input of this length // Loop over X indices, including boundaries but not guard cells (unless periodic in // x) @@ -199,9 +203,9 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { for (int ix = xs; ix <= xe; ix++) { // Take FFT in Z direction, apply shift, and put result in k1d - if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) || - ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) && - localmesh->lastX())) { + if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) + || ((localmesh->LocalNx - ix - 1 < outbndry) + && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) { // Use the values in x0 in the boundary rfft(x0[ix], localmesh->LocalNz, std::begin(k1d)); } else { @@ -232,11 +236,10 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { cr->solve(bcmplx, xcmplx); // FFT back to real space - BOUT_OMP(parallel) - { + BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = Array((localmesh->LocalNz) / 2 + - 1); // ZFFT routine expects input of this length + auto k1d = Array((localmesh->LocalNz) / 2 + + 1); // ZFFT routine expects input of this length const bool zero_DC = global_flags & INVERT_ZERO_DC; @@ -298,8 +301,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { } if (localmesh->hasBndryUpperY()) { if (include_yguards) - ye = localmesh->LocalNy - - 1; // Contains upper boundary and we are solving in the guard cells + ye = localmesh->LocalNy + - 1; // Contains upper boundary and we are solving in the guard cells ye -= extra_yguards_upper; } @@ -318,8 +321,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { if (dst) { BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = - Array(localmesh->LocalNz); // ZFFT routine expects input of this length + auto k1d = Array( + localmesh->LocalNz); // ZFFT routine expects input of this length // Loop over X and Y indices, including boundaries but not guard cells. // (unless periodic in x) @@ -331,9 +334,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { // Take DST in Z direction and put result in k1d - if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) || - ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) && - localmesh->lastX())) { + if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) + || ((localmesh->LocalNx - ix - 1 < outbndry) + && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) { // Use the values in x0 in the boundary DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d)); } else { @@ -374,8 +377,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { // FFT back to real space BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = - Array(localmesh->LocalNz); // ZFFT routine expects input of this length + auto k1d = Array( + localmesh->LocalNz); // ZFFT routine expects input of this length BOUT_OMP(for nowait) for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y @@ -399,8 +402,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { } else { BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = Array(localmesh->LocalNz / 2 + - 1); // ZFFT routine expects input of this length + auto k1d = Array(localmesh->LocalNz / 2 + + 1); // ZFFT routine expects input of this length // Loop over X and Y indices, including boundaries but not guard cells // (unless periodic in x) @@ -413,9 +416,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { // Take FFT in Z direction, apply shift, and put result in k1d - if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) || - ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) && - localmesh->lastX())) { + if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) + || ((localmesh->LocalNx - ix - 1 < outbndry) + && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) { // Use the values in x0 in the boundary rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d)); } else { @@ -452,8 +455,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { // FFT back to real space BOUT_OMP(parallel) { /// Create a local thread-scope working array - auto k1d = Array((localmesh->LocalNz) / 2 + - 1); // ZFFT routine expects input of this length + auto k1d = Array((localmesh->LocalNz) / 2 + + 1); // ZFFT routine expects input of this length const bool zero_DC = global_flags & INVERT_ZERO_DC; diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest index 446e808037..454716cc3e 100755 --- a/tests/integrated/test-laplace/runtest +++ b/tests/integrated/test-laplace/runtest @@ -52,9 +52,9 @@ for v in vars: print("Running Laplacian inversion test") success = True -for nproc, nxpe, ngather in [(1,1,1), (2,1,1), (2,2,2), (4,2,1), (4,4,2)]: +for nproc, nxpe, ngather in [(1, 1, 1), (2, 1, 1), (2, 2, 2), (4, 2, 1), (4, 4, 2)]: - cmd = "./test_laplace nxpe=" + str(nxpe)+ " laplace:ngather=" + str(ngather) + cmd = "./test_laplace nxpe=" + str(nxpe) + " laplace:ngather=" + str(ngather) shell("rm data/BOUT.dmp.*.nc") From a5f0a64bc9583f4ea6a344740987e2c67380b018 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Thu, 13 May 2021 15:26:13 +0100 Subject: [PATCH 05/13] Tidying, using unique_ptr Clang-tidy and @ZedThree suggestions --- include/cyclic_reduction.hxx | 9 +++++---- .../laplace/impls/cyclic/cyclic_laplace.cxx | 17 +++++++---------- .../laplace/impls/cyclic/cyclic_laplace.hxx | 5 ++--- tests/integrated/test-cyclic/test_cyclic.cxx | 5 +---- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx index 26aa0f85f3..4395bb0ca3 100644 --- a/include/cyclic_reduction.hxx +++ b/include/cyclic_reduction.hxx @@ -285,8 +285,9 @@ public: } int nsp = ns; // Number of systems to send to this processor - if (i < nsextra) + if (i < nsextra) { nsp++; // Some processors get an extra system + } gather_proc[i] = p; gather_sys_offset[i] = s0; @@ -444,7 +445,7 @@ public: } // Wait for data - int fromind; + int fromind{MPI_UNDEFINED}; do { MPI_Status stat; MPI_Waitany(ngatherprocs, gather_req.data(), &fromind, &stat); @@ -490,8 +491,8 @@ private: int N{0}; ///< Total size of the problem int Nsys{0}; ///< Number of independent systems to solve - int myns; ///< Number of systems for interface solve on this processor - int sys0; ///< Starting system index for interface solve + int myns{0}; ///< Number of systems for interface solve on this processor + int sys0{0}; ///< Starting system index for interface solve bool periodic{false}; ///< Is the domain periodic? diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx index 9ab6e5f942..f4415fb7aa 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx @@ -93,15 +93,10 @@ LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in) .withDefault(0); // Create a cyclic reduction object, operating on dcomplex values - cr = new CyclicReduce(localmesh->getXcomm(), n, ngather); + cr = std::make_unique>(localmesh->getXcomm(), n, ngather); cr->setPeriodic(localmesh->periodicX); } -LaplaceCyclic::~LaplaceCyclic() { - // Delete tridiagonal solver - delete cr; -} - FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) { ASSERT1(localmesh == rhs.getMesh() && localmesh == x0.getMesh()); ASSERT1(rhs.getLocation() == location); @@ -334,9 +329,10 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { // Take DST in Z direction and put result in k1d - if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) + if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0) + && localmesh->firstX()) || ((localmesh->LocalNx - ix - 1 < outbndry) - && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) { + && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) { // Use the values in x0 in the boundary DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d)); } else { @@ -416,9 +412,10 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) { // Take FFT in Z direction, apply shift, and put result in k1d - if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) + if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0) + && localmesh->firstX()) || ((localmesh->LocalNx - ix - 1 < outbndry) - && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) { + && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) { // Use the values in x0 in the boundary rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d)); } else { diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx index 9e1161ecd8..17259bf661 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx @@ -49,8 +49,7 @@ RegisterLaplace registerlaplacecycle(LAPLACE_CYCLIC); class LaplaceCyclic : public Laplacian { public: LaplaceCyclic(Options *opt = nullptr, const CELL_LOC loc = CELL_CENTRE, Mesh *mesh_in = nullptr); - ~LaplaceCyclic(); - + using Laplacian::setCoefA; void setCoefA(const Field2D &val) override { ASSERT1(val.getLocation() == location); @@ -104,7 +103,7 @@ private: bool dst; - CyclicReduce *cr; ///< Tridiagonal solver + std::unique_ptr> cr; ///< Tridiagonal solver }; #endif // __SPT_H__ diff --git a/tests/integrated/test-cyclic/test_cyclic.cxx b/tests/integrated/test-cyclic/test_cyclic.cxx index 366ba2a6b4..6814d31138 100644 --- a/tests/integrated/test-cyclic/test_cyclic.cxx +++ b/tests/integrated/test-cyclic/test_cyclic.cxx @@ -37,7 +37,7 @@ int main(int argc, char **argv) { options["ngather"].doc("The number of processors to gather onto").withDefault(npe); // Create a cyclic reduction object, operating on Ts - auto* cr = new CyclicReduce(BoutComm::get(), n, ngather); + auto cr = std::make_unique>(BoutComm::get(), n, ngather); a.reallocate(nsys, n); b.reallocate(nsys, n); @@ -89,9 +89,6 @@ int main(int argc, char **argv) { cr->setPeriodic(periodic); cr->setCoefs(a, b, c); cr->solve(rhs, x); - - // Destroy solver - delete cr; // Check result From 72b8e64933a71e71ce4b8beb642f182585372094 Mon Sep 17 00:00:00 2001 From: bendudson Date: Mon, 25 Jul 2022 17:54:52 +0000 Subject: [PATCH 06/13] Apply clang-format changes --- src/invert/laplace/impls/cyclic/cyclic_laplace.cxx | 6 ++---- src/invert/laplace/impls/cyclic/cyclic_laplace.hxx | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx index 4dd6efb3b9..a3d530cf23 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx @@ -41,7 +41,7 @@ #if not BOUT_USE_METRIC_3D >>>>>>> next -#include + #include < bout / constants.hxx> #include #include #include @@ -52,9 +52,7 @@ #include "cyclic_laplace.hxx" -LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in, - Solver* UNUSED(solver), Datafile* UNUSED(dump)) - : Laplacian(opt, loc, mesh_in), Acoef(0.0), C1coef(1.0), C2coef(1.0), Dcoef(1.0) { + LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in, Solver* UNUSED(solver), Datafile* UNUSED(dump)) : Laplacian(opt, loc, mesh_in), Acoef(0.0), C1coef(1.0), C2coef(1.0), Dcoef(1.0) { Acoef.setLocation(location); C1coef.setLocation(location); diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx index e76eff94be..03186e16ff 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx @@ -120,7 +120,7 @@ private: Matrix a, b, c, bcmplx, xcmplx; bool dst; - + std::unique_ptr> cr; ///< Tridiagonal solver }; From 6db58ff97d2495892f5f9c5f2ebeb0c0027109d6 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Mon, 25 Jul 2022 11:12:35 -0700 Subject: [PATCH 07/13] Missed a merge conflict in cyclic_laplace --- src/invert/laplace/impls/cyclic/cyclic_laplace.cxx | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx index 4dd6efb3b9..e0e880c9b2 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx @@ -33,14 +33,11 @@ * */ -<<<<<<< HEAD -======= #include "cyclic_laplace.hxx" #include "bout/build_config.hxx" #if not BOUT_USE_METRIC_3D ->>>>>>> next #include #include #include From deb463f8bdb681377feb33fd4d5453f67fd2ea6d Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Mon, 25 Jul 2022 11:53:29 -0700 Subject: [PATCH 08/13] CyclicLaplace destructor replaced by unique_ptr --- src/invert/laplace/impls/cyclic/cyclic_laplace.hxx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx index 03186e16ff..bf7ed6d4ce 100644 --- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx +++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx @@ -62,7 +62,6 @@ public: LaplaceCyclic(Options* opt = nullptr, const CELL_LOC loc = CELL_CENTRE, Mesh* mesh_in = nullptr, Solver* solver = nullptr, Datafile* dump = nullptr); - ~LaplaceCyclic(); using Laplacian::setCoefA; void setCoefA(const Field2D &val) override { From 7901661995dc4b0bcbdda88a6dac123884a3a06e Mon Sep 17 00:00:00 2001 From: bendudson Date: Wed, 14 Jun 2023 21:06:59 +0000 Subject: [PATCH 09/13] Apply black changes --- tests/integrated/test-laplace/runtest | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest index c7d9ab17dc..1aa389b1ec 100755 --- a/tests/integrated/test-laplace/runtest +++ b/tests/integrated/test-laplace/runtest @@ -57,14 +57,26 @@ for solver in ["cyclic", "pcr", "pcr_thomas"]: # Try different combinations of processor numbers # If not cyclic then ignore ngather option for nproc, nxpe, ngather in [(1, 1, 1), (2, 1, 1), (2, 2, 2), (4, 2, 1), (4, 4, 2)]: - - cmd = "./test_laplace NXPE=" + str(nxpe) + " laplace:type=" + solver + (" laplace:ngather=" + str(ngather)) if solver == "cyclic" else "" + cmd = ( + "./test_laplace NXPE=" + + str(nxpe) + + " laplace:type=" + + solver + + (" laplace:ngather=" + str(ngather)) + if solver == "cyclic" + else "" + ) shell("rm data/BOUT.dmp.*.nc") - print(" %s solver with %d processors (nxpe = %d, ngather = %d)...." % (solver, nproc, nxpe, ngather)) + print( + " %s solver with %d processors (nxpe = %d, ngather = %d)...." + % (solver, nproc, nxpe, ngather) + ) s, out = launch_safe(cmd, nproc=nproc, mthread=1, pipe=True) - with open("run.log." + str(nproc) + "." + str(nxpe) + "." + str(ngather), "w") as f: + with open( + "run.log." + str(nproc) + "." + str(nxpe) + "." + str(ngather), "w" + ) as f: f.write(out) # Collect output data From 47d69ff09e93533332fa3f96018b590d3860dd5e Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Wed, 14 Jun 2023 14:20:19 -0700 Subject: [PATCH 10/13] Clang-tidy suggestions: Add braces --- include/bout/cyclic_reduction.hxx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/bout/cyclic_reduction.hxx b/include/bout/cyclic_reduction.hxx index 7c166c9b09..f7dbc71cbd 100644 --- a/include/bout/cyclic_reduction.hxx +++ b/include/bout/cyclic_reduction.hxx @@ -247,9 +247,11 @@ public: if (p == myproc) { // Just copy the data BOUT_OMP(parallel for) - for (int i = 0; i < myns; i++) - for (int j = 0; j < 8; j++) + for (int i = 0; i < myns; i++) { + for (int j = 0; j < 8; j++) { ifcs(i, 8 * p + j) = myif(sys0 + i, j); + } + } } else { #ifdef DIAGNOSE output << "Expecting to receive " << len << " from " << p << endl; @@ -327,13 +329,14 @@ public: output << "Copying received data from " << p << endl; #endif BOUT_OMP(parallel for) - for (int i = 0; i < myns; i++) + for (int i = 0; i < myns; i++) { for (int j = 0; j < 8; j++) { #ifdef DIAGNOSE output << "Value " << j << " : " << all_buffer(p, 8 * i + j) << endl; #endif ifcs(i, 8 * p + j) = all_buffer(p, 8 * i + j); } + } all_req[p] = MPI_REQUEST_NULL; } } while (p != MPI_UNDEFINED); @@ -480,7 +483,7 @@ public: } private: - MPI_Comm comm; ///< Communicator + MPI_Comm comm{}; ///< Communicator int nprocs{0}, myproc{-1}; ///< Number of processors and ID of my processor int ngatherprocs{0}; ///< Number of processors to gather onto @@ -512,8 +515,9 @@ private: /// Allocate memory arrays /// @param[in] nsys Number of independent systems to solve void allocMemory(int nsys) { - if (nsys == Nsys) + if (nsys == Nsys) { return; // No need to allocate memory + } Nsys = nsys; From 8a4d12221f620b093d6e9cc47301445acd578670 Mon Sep 17 00:00:00 2001 From: bendudson Date: Wed, 14 Jun 2023 21:21:49 +0000 Subject: [PATCH 11/13] Apply clang-format changes --- externalpackages/PVODE/precon/band.cpp | 2 +- externalpackages/PVODE/source/cvode.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/externalpackages/PVODE/precon/band.cpp b/externalpackages/PVODE/precon/band.cpp index c1d0f6f1e2..8b444de191 100644 --- a/externalpackages/PVODE/precon/band.cpp +++ b/externalpackages/PVODE/precon/band.cpp @@ -185,7 +185,7 @@ integer gbfa(real **a, integer n, integer mu, integer ml, integer smu, if (col_k[storage_l] == ZERO) return(k+1); /* swap a(l,k) and a(k,k) if necessary */ - + if (swap = (l != k)) { temp = col_k[storage_l]; col_k[storage_l] = *diag_k; diff --git a/externalpackages/PVODE/source/cvode.cpp b/externalpackages/PVODE/source/cvode.cpp index e28a3b1ae6..3617bf34ef 100644 --- a/externalpackages/PVODE/source/cvode.cpp +++ b/externalpackages/PVODE/source/cvode.cpp @@ -179,7 +179,7 @@ namespace pvode { #define MSG_Y0_NULL CVM "y0=NULL illegal.\n\n" -#define MSG_BAD_N CVM "N=%ld < 1 illegal.\n\n" +#define MSG_BAD_N CVM "N=%ld < 1 illegal.\n\n" #define MSG_BAD_LMM_1 CVM "lmm=%d illegal.\n" #define MSG_BAD_LMM_2 "The legal values are ADAMS=%d and BDF=%d.\n\n" @@ -1825,7 +1825,7 @@ static int CVnls(CVodeMem cv_mem, int nflag) switch(iter) { case FUNCTIONAL : return(CVnlsFunctional(cv_mem)); case NEWTON : return(CVnlsNewton(cv_mem, nflag)); - } + } } /***************** CVnlsFunctional ******************************** @@ -2391,7 +2391,7 @@ static int CVHandleFailure(CVodeMem cv_mem, int kflag) return(SETUP_FAILURE); case SOLVE_FAILED: fprintf(errfp, MSG_SOLVE_FAILED, tn); return(SOLVE_FAILURE); - } + } } /*******************************************************************/ From b4f7ca417ff16547d6f24ca36aec51c724d3a94e Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Wed, 14 Jun 2023 14:51:35 -0700 Subject: [PATCH 12/13] Fix merge in PVODE --- externalpackages/PVODE/source/cvode.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/externalpackages/PVODE/source/cvode.cpp b/externalpackages/PVODE/source/cvode.cpp index 418bd14be2..48ac3209f2 100644 --- a/externalpackages/PVODE/source/cvode.cpp +++ b/externalpackages/PVODE/source/cvode.cpp @@ -1825,13 +1825,9 @@ static int CVnls(CVodeMem cv_mem, int nflag) switch(iter) { case FUNCTIONAL : return(CVnlsFunctional(cv_mem)); case NEWTON : return(CVnlsNewton(cv_mem, nflag)); -<<<<<<< HEAD - } -======= } fprintf(errfp, "Should be unreachable ..."); return (ERR_FAILURE); ->>>>>>> v5.1.0-rc } /***************** CVnlsFunctional ******************************** @@ -2397,13 +2393,9 @@ static int CVHandleFailure(CVodeMem cv_mem, int kflag) return(SETUP_FAILURE); case SOLVE_FAILED: fprintf(errfp, MSG_SOLVE_FAILED, tn); return(SOLVE_FAILURE); -<<<<<<< HEAD - } -======= } fprintf(errfp, "Should be unreachable ..."); return (ERR_FAILURE); ->>>>>>> v5.1.0-rc } /*******************************************************************/ From b75ffa7a3fb63f86c407198c18188433afa617d5 Mon Sep 17 00:00:00 2001 From: Ben Dudson Date: Wed, 14 Jun 2023 15:26:15 -0700 Subject: [PATCH 13/13] test-laplace fix script adding ngather setting `if` applied to entire string rather than just last part --- tests/integrated/test-laplace/runtest | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest index 1aa389b1ec..b435f80833 100755 --- a/tests/integrated/test-laplace/runtest +++ b/tests/integrated/test-laplace/runtest @@ -62,9 +62,7 @@ for solver in ["cyclic", "pcr", "pcr_thomas"]: + str(nxpe) + " laplace:type=" + solver - + (" laplace:ngather=" + str(ngather)) - if solver == "cyclic" - else "" + + ((" laplace:ngather=" + str(ngather)) if solver == "cyclic" else "") ) shell("rm data/BOUT.dmp.*.nc")