From 3916ca243e86c1cb28e75e6d6178d8c8ee49d519 Mon Sep 17 00:00:00 2001
From: bendudson <bendudson@users.noreply.github.com>
Date: Wed, 12 May 2021 17:01:36 +0000
Subject: [PATCH 01/13] [skip ci] Apply black changes

---
 tests/integrated/test-cyclic/runtest | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/tests/integrated/test-cyclic/runtest b/tests/integrated/test-cyclic/runtest
index 051744a61b..9ebdf9f552 100755
--- a/tests/integrated/test-cyclic/runtest
+++ b/tests/integrated/test-cyclic/runtest
@@ -20,8 +20,14 @@ from sys import stdout, exit
 
 build_and_log("Cyclic Reduction test")
 
-flags = ["", "nsys=2", "nsys=5 periodic", "nsys=7 n=10",
-         "ngather=1", "nsys=7 n=10 ngather=1"]
+flags = [
+    "",
+    "nsys=2",
+    "nsys=5 periodic",
+    "nsys=7 n=10",
+    "ngather=1",
+    "nsys=7 n=10 ngather=1",
+]
 
 code = 0  # Return code
 for nproc in [1, 2, 4]:

From ac7f3cd6a85ba8d5c37b48f277d0247e8bf4e82f Mon Sep 17 00:00:00 2001
From: Ben Dudson <bd512@york.ac.uk>
Date: Wed, 12 May 2021 18:11:04 +0100
Subject: [PATCH 02/13] Add an option to CyclicReduce to gather onto fewer
 processors

By default all processors are used, resulting in an all-to-all
communication pattern.

An optional third argument to the constructor now specifies how many
processors should be used. This reduces the number of messages, at the
cost of increasing load imbalance. The processors used are evenly
distributed, in the hope that this will minimise network traffic on
any given node.
---
 include/cyclic_reduction.hxx                 | 232 ++++++++++++-------
 tests/integrated/test-cyclic/runtest         |   3 +-
 tests/integrated/test-cyclic/test_cyclic.cxx |  11 +-
 3 files changed, 159 insertions(+), 87 deletions(-)

diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx
index 9958937e3d..00851b52da 100644
--- a/include/cyclic_reduction.hxx
+++ b/include/cyclic_reduction.hxx
@@ -13,7 +13,7 @@
  alpha = a3 / b1
  *
  **************************************************************************
- * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu
+ * Copyright 2010,2021 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu
  *
  * Contact: Ben Dudson, bd512@york.ac.uk
  *
@@ -58,26 +58,39 @@ template <class T> class CyclicReduce {
 public:
   CyclicReduce() = default;
 
-  CyclicReduce(MPI_Comm c, int size) : comm(c), N(size) {
-    MPI_Comm_size(c, &nprocs);
-    MPI_Comm_rank(c, &myproc);
+  CyclicReduce(MPI_Comm c, int size, int ngather=0) : comm(c), ngatherprocs(ngather), N(size) {
+    setup(c, size, ngather);
   }
 
   /// Set parameters
+  /// This can be called after creation, to reset the solver
+  ///
   /// @param[in] c  The communicator of all processors involved in the solve
   /// @param[in] size  The number of rows on this processor
-  void setup(MPI_Comm c, int size) {
+  /// @param[in] gather  The number of processors to gather onto. If 0, use all processors
+  void setup(MPI_Comm c, int size, int ngather=0) {
     comm = c;
 
     int np, myp;
     MPI_Comm_size(c, &np);
     MPI_Comm_rank(c, &myp);
-    if ((size != N) || (np != nprocs) || (myp != myproc))
+
+    if (ngather == 0) {
+      ngather = np; // Use all processors
+    }
+
+    if ((size != N) || (np != nprocs) || (myp != myproc) || (ngather != ngatherprocs)) {
       Nsys = 0; // Need to re-size
+    }
+
     N = size;
     periodic = false;
     nprocs = np;
     myproc = myp;
+
+    ngatherprocs = ngather;
+    ASSERT1(ngatherprocs > 0);
+    ASSERT1(ngatherprocs <= nprocs);
   }
 
   ~CyclicReduce() = default;
@@ -117,7 +130,7 @@ public:
     int nsys = std::get<0>(a.shape());
 
     // Make sure correct memory arrays allocated
-    allocMemory(nprocs, nsys, N);
+    allocMemory(nsys);
 
     // Fill coefficient array
     BOUT_OMP(parallel for)
@@ -190,13 +203,14 @@ public:
 
     ///////////////////////////////////////
     // Gather all interface equations onto single processor
-    // NOTE: Need to replace with divide-and-conquer at some point
+    // NOTE: Cyclic reduction would do this in multiple stages,
+    //       but here we just gather onto ngatherprocs processors
     //
-    // There are Nsys sets of equations to gather, and nprocs processors
+    // There are Nsys sets of equations to gather, and ngatherprocs processors
     // which can be used. Each processor therefore sends interface equations
     // to several different processors for gathering
     //
-    // e.g. 3 processors (nproc=3), 4 sets of equations (Nsys=4):
+    // e.g. 3 processors (nprocs=3 and ngatherprocs=3), 4 sets of equations (Nsys=4):
     //
     // PE 0: [1a 2a 3a 4a]  PE 1: [1b 2b 3b 4b]  PE 2: [1c 2c 3c 4c]
     //
@@ -207,14 +221,19 @@ public:
     //
     // Here PE 0 would have myns=2, PE 1 and 2 would have myns=1
 
-    int ns = Nsys / nprocs;      // Number of systems to assign to all processors
-    int nsextra = Nsys % nprocs; // Number of processors with 1 extra
+    int ns = Nsys / ngatherprocs;      // Number of systems to assign to all gathering processors
+    int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra
 
-    auto* req = new MPI_Request[nprocs];
+    // Receive requests
 
     if (myns > 0) {
-      // Post receives from all other processors
-      req[myproc] = MPI_REQUEST_NULL;
+      // If gathering systems onto this processor
+      // post receives from all other processors
+
+      all_req.resize(nprocs); // One communicator per processor
+      all_buffer.reallocate(nprocs, myns * 8); // Storage for systems from each processor
+
+      all_req[myproc] = MPI_REQUEST_NULL;
       for (int p = 0; p < nprocs; p++) { // Loop over processor
         // 2 interface equations per processor
         // myns systems to solve
@@ -231,27 +250,48 @@ public:
 #ifdef DIAGNOSE
           output << "Expecting to receive " << len << " from " << p << endl;
 #endif
-          MPI_Irecv(&recvbuffer(p, 0), len,
+          MPI_Irecv(&all_buffer(p, 0), len,
                     MPI_BYTE, // Just sending raw data, unknown type
                     p,        // Destination processor
                     p,        // Identifier
                     comm,     // Communicator
-                    &req[p]); // Request
+                    &all_req[p]); // Request
         }
       }
     }
 
-    // Send data
-    int s0 = 0;
-    for (int p = 0; p < nprocs; p++) { // Loop over processor
-      int nsp = ns;
-      if (p < nsextra)
-        nsp++;
+    // Send data to the processors which are gathering
+
+    gather_req.resize(ngatherprocs);   // One request per gathering processor
+    gather_proc.resize(ngatherprocs);  // Processor number
+    gather_sys_offset.resize(ngatherprocs);  // Index of the first system gathered
+    gather_nsys.resize(ngatherprocs);  // Number of systems
+
+    // Interval between gathering processors
+    BoutReal pinterval = static_cast<BoutReal>(nprocs) / ngatherprocs;
+
+    int s0 = 0;  // System number
+
+    // Loop over gathering processors
+    for (int i = 0; i < ngatherprocs; i++) {
+
+      int p = i; // Gathering onto all processors
+      if (ngatherprocs != nprocs) {
+        // Gathering onto only some
+        p = static_cast<int>( pinterval * i );
+      }
+
+      int nsp = ns; // Number of systems to send to this processor
+      if (i < nsextra)
+        nsp++; // Some processors get an extra system
+
+      gather_proc[i] = p;
+      gather_sys_offset[i] = s0;
+      gather_nsys[i] = nsp;
+
       if ((p != myproc) && (nsp > 0)) {
 #ifdef DIAGNOSE
         output << "Sending to " << p << endl;
-        for (int i = 0; i < 8; i++)
-          output << "value " << i << " : " << myif(s0, i) << endl;
 #endif
         MPI_Send(&myif(s0, 0),        // Data pointer
                  8 * nsp * sizeof(T), // Number
@@ -260,15 +300,18 @@ public:
                  myproc,              // Message identifier
                  comm);               // Communicator
       }
+
       s0 += nsp;
     }
 
     if (myns > 0) {
+      // If this processor is gathering systems
+
       // Wait for data
       int p;
       do {
         MPI_Status stat;
-        MPI_Waitany(nprocs, req, &p, &stat);
+        MPI_Waitany(nprocs, all_req.data(), &p, &stat);
         if (p != MPI_UNDEFINED) {
 // p is the processor number. Copy data
 #ifdef DIAGNOSE
@@ -278,11 +321,11 @@ public:
           for (int i = 0; i < myns; i++)
             for (int j = 0; j < 8; j++) {
 #ifdef DIAGNOSE
-              output << "Value " << j << " : " << recvbuffer(p, 8 * i + j) << endl;
+              output << "Value " << j << " : " << all_buffer(p, 8 * i + j) << endl;
 #endif
-              ifcs(i, 8 * p + j) = recvbuffer(p, 8 * i + j);
+              ifcs(i, 8 * p + j) = all_buffer(p, 8 * i + j);
             }
-          req[p] = MPI_REQUEST_NULL;
+          all_req[p] = MPI_REQUEST_NULL;
         }
       } while (p != MPI_UNDEFINED);
 
@@ -342,11 +385,15 @@ public:
       ///////////////////////////////////////
       // Scatter back solution
 
+      // Allocate buffer space. Note assumes 0th processor has most systems
+      gather_buffer.reallocate(ngatherprocs, 2 * gather_nsys[0]);
+
       // Post receives
-      for (int p = 0; p < nprocs; p++) { // Loop over processor
-        int nsp = ns;
-        if (p < nsextra)
-          nsp++;
+      // Loop over gathering processors
+      for (int i = 0; i < ngatherprocs; i++) {
+        int p = gather_proc[i]; // Processor number
+        int nsp = gather_nsys[i]; // Number of systems
+
         int len = 2 * nsp * sizeof(T); // 2 values per system
 
         if (p == myproc) {
@@ -356,19 +403,20 @@ public:
             x1[sys0 + i] = ifx(i, 2 * p);
             xn[sys0 + i] = ifx(i, 2 * p + 1);
           }
-          req[p] = MPI_REQUEST_NULL;
+          gather_req[i] = MPI_REQUEST_NULL;
         } else if (nsp > 0) {
 #ifdef DIAGNOSE
           output << "Expecting receive from " << p << " of size " << len << endl;
 #endif
-          MPI_Irecv(&recvbuffer(p, 0), len,
+          MPI_Irecv(&gather_buffer(i, 0), len,
                     MPI_BYTE, // Just sending raw data, unknown type
                     p,        // Destination processor
                     p,        // Identifier
                     comm,     // Communicator
-                    &req[p]); // Request
-        } else
-          req[p] = MPI_REQUEST_NULL;
+                    &gather_req[p]); // Request
+        } else {
+          gather_req[i] = MPI_REQUEST_NULL;
+        }
       }
 
       if (myns > 0) {
@@ -392,49 +440,51 @@ public:
       }
 
       // Wait for data
-      int fromproc;
-      int nsp;
+      int fromind;
       do {
         MPI_Status stat;
-        MPI_Waitany(nprocs, req, &fromproc, &stat);
+        MPI_Waitany(ngatherprocs, gather_req.data(), &fromind, &stat);
 
-        if (fromproc != MPI_UNDEFINED) {
-          // fromproc is the processor number. Copy data
+        if (fromind != MPI_UNDEFINED) {
+          // Copy data
 
-          int s0 = fromproc * ns;
-          if (fromproc > nsextra) {
-            s0 += nsextra;
-          } else
-            s0 += fromproc;
+          int fromproc = gather_proc[fromind]; // From processor
+          int s0 = gather_sys_offset[fromind]; // System index start
+          int nsp = gather_nsys[fromind];      // Number of systems
 
-          nsp = ns;
-          if (fromproc < nsextra)
-            nsp++;
-	  
 	  BOUT_OMP(parallel for)
           for (int i = 0; i < nsp; i++) {
-            x1[s0 + i] = recvbuffer(fromproc, 2 * i);
-            xn[s0 + i] = recvbuffer(fromproc, 2 * i + 1);
+            x1[s0 + i] = gather_buffer(fromind, 2 * i);
+            xn[s0 + i] = gather_buffer(fromind, 2 * i + 1);
 #ifdef DIAGNOSE
             output << "Received x1,xn[" << s0 + i << "] = " << x1[s0 + i] << ", "
                    << xn[s0 + i] << " from " << fromproc << endl;
 #endif
           }
-          req[fromproc] = MPI_REQUEST_NULL;
+          gather_req[fromproc] = MPI_REQUEST_NULL;
         }
-      } while (fromproc != MPI_UNDEFINED);
+      } while (fromind != MPI_UNDEFINED);
     }
 
     ///////////////////////////////////////
     // Solve local equations
     back_solve(Nsys, N, coefs, x1, xn, x);
-    delete[] req;
   }
 
 private:
   MPI_Comm comm;             ///< Communicator
   int nprocs{0}, myproc{-1}; ///< Number of processors and ID of my processor
 
+  int ngatherprocs{0};  ///< Number of processors to gather onto
+  std::vector<MPI_Request> gather_req; ///< Comms with gathering processors
+  std::vector<int> gather_proc;  ///< The processor number
+  std::vector<int> gather_sys_offset; ///< Index of the first system gathered
+  std::vector<int> gather_nsys;       ///< Number of systems gathered
+  Matrix<T> gather_buffer; ///< Buffer for receiving from gathering processors
+
+  std::vector<MPI_Request> all_req; ///< Requests for comms with all processors
+  Matrix<T> all_buffer; ///< Buffer for receiving from all other processors
+
   int N{0};    ///< Total size of the problem
   int Nsys{0}; ///< Number of independent systems to solve
   int myns;    ///< Number of systems for interface solve on this processor
@@ -445,7 +495,6 @@ private:
   Matrix<T> coefs; ///< Starting coefficients, rhs [Nsys, {3*coef,rhs}*N]
   Matrix<T> myif;  ///< Interface equations for this processor
 
-  Matrix<T> recvbuffer; ///< Buffer for receiving from other processors
   Matrix<T> ifcs;       ///< Coefficients for interface solve
   Matrix<T> if2x2;      ///< 2x2 interface equations on this processor
   Matrix<T> ifx;        ///< Solution of interface equations
@@ -453,42 +502,61 @@ private:
   Array<T> x1, xn;      ///< Interface solutions for back-solving
 
   /// Allocate memory arrays
-  /// @param[in] np   Number of processors
   /// @param[in] nsys  Number of independent systems to solve
-  /// @param[in] n     Size of each system of equations
-  void allocMemory(int np, int nsys, int n) {
-    if ((nsys == Nsys) && (n == N) && (np == nprocs))
+  void allocMemory(int nsys) {
+    if (nsys == Nsys)
       return; // No need to allocate memory
 
-    nprocs = np;
     Nsys = nsys;
-    N = n;
 
     // Work out how many systems are going to be solved on this processor
-    int ns = nsys / nprocs;      // Number of systems to assign to all processors
-    int nsextra = nsys % nprocs; // Number of processors with 1 extra
-
-    myns = ns;          // Number of systems to gather onto this processor
-    sys0 = ns * myproc; // Starting system number
-    if (myproc < nsextra) {
-      myns++;
-      sys0 += myproc;
+    int ns = nsys / ngatherprocs;      // Number of systems to assign to all processors
+    int nsextra = nsys % ngatherprocs; // Number of processors with 1 extra
+
+    if (ngatherprocs == nprocs) {
+      // Gathering onto all processors (This is the default)
+
+      myns = ns;          // Number of systems to gather onto this processor
+      sys0 = ns * myproc; // Starting system number
+      if (myproc < nsextra) {
+        myns++;
+        sys0 += myproc;
+      } else {
+        sys0 += nsextra;
+      }
     } else {
-      sys0 += nsextra;
+      // Gathering onto only a few processors
+
+      myns = 0; // Default to not gathering onto this processor
+
+      // Interval between processors
+      BoutReal pinterval = static_cast<BoutReal>(nprocs) / ngatherprocs;
+      ASSERT1(pinterval > 1.0);
+
+      // Calculate which processors these are
+      for (int i = 0; i < ngatherprocs; i++) {
+        int proc = static_cast<int>( pinterval * i );
+
+        if (proc == myproc) {
+          // This processor is receiving
+
+          myns = ns;     // Number of systems to gather onto this processor
+          sys0 = ns * i; // Starting system number
+
+          if (i < nsextra) {
+            // Receiving an extra system
+            myns++;
+            sys0 += i;
+          } else {
+            sys0 += nsextra;
+          }
+        }
+      }
     }
 
     coefs.reallocate(Nsys, 4 * N);
     myif.reallocate(Nsys, 8);
 
-    // Note: The recvbuffer is used to receive data in both stages of the solve:
-    //  1. In the gather step, this processor will receive myns interface equations
-    //     from each processor.
-    //  2. In the scatter step, this processor receives the solved interface values
-    //     from each processor. The number of systems of equations received will
-    //     vary from myns to myns+1 (if myproc >= nsextra).
-    // The size of the array reserved is therefore (myns+1)
-    recvbuffer.reallocate(nprocs, (myns + 1) * 8);
-
     // Some interface systems to be solved on this processor
     // Note that the interface equations are organised by system (myns as first argument)
     // but communication buffers are organised by processor (nprocs first).
diff --git a/tests/integrated/test-cyclic/runtest b/tests/integrated/test-cyclic/runtest
index ae2bca24c7..051744a61b 100755
--- a/tests/integrated/test-cyclic/runtest
+++ b/tests/integrated/test-cyclic/runtest
@@ -20,7 +20,8 @@ from sys import stdout, exit
 
 build_and_log("Cyclic Reduction test")
 
-flags = ["", "nsys=2", "nsys=5 periodic", "nsys=7 n=10"]
+flags = ["", "nsys=2", "nsys=5 periodic", "nsys=7 n=10",
+         "ngather=1", "nsys=7 n=10 ngather=1"]
 
 code = 0  # Return code
 for nproc in [1, 2, 4]:
diff --git a/tests/integrated/test-cyclic/test_cyclic.cxx b/tests/integrated/test-cyclic/test_cyclic.cxx
index f90147d900..366ba2a6b4 100644
--- a/tests/integrated/test-cyclic/test_cyclic.cxx
+++ b/tests/integrated/test-cyclic/test_cyclic.cxx
@@ -21,7 +21,7 @@ int main(int argc, char **argv) {
 
   int nsys;
   int n;
-  Options *options = Options::getRoot();
+  Options& options = Options::root();
   OPTION(options, n, 5);
   OPTION(options, nsys, 1);
   BoutReal tol;
@@ -29,13 +29,16 @@ int main(int argc, char **argv) {
   bool periodic;
   OPTION(options, periodic, false);
 
-  // Create a cyclic reduction object, operating on Ts
-  auto* cr = new CyclicReduce<T>(BoutComm::get(), n);
-
   int mype, npe;
   MPI_Comm_rank(BoutComm::get(), &mype);
   MPI_Comm_size(BoutComm::get(), &npe);
 
+  int ngather =
+      options["ngather"].doc("The number of processors to gather onto").withDefault(npe);
+
+  // Create a cyclic reduction object, operating on Ts
+  auto* cr = new CyclicReduce<T>(BoutComm::get(), n, ngather);
+
   a.reallocate(nsys, n);
   b.reallocate(nsys, n);
   c.reallocate(nsys, n);

From 6ff389629d1f509e55cdfabdc6555df8d5b0a0c7 Mon Sep 17 00:00:00 2001
From: Ben Dudson <bd512@york.ac.uk>
Date: Thu, 13 May 2021 10:09:14 +0100
Subject: [PATCH 03/13] Add ngather option to Laplace cyclic solver

Input option to control the number of processors (in X) to gather
systems onto. This can be used to tune performance: Smaller ngather
leads to fewer messages, but more load imbalance.

Modified test, to cover more variations of nxpe and ngather.

That test found some bugs, which are fixed here.
---
 include/cyclic_reduction.hxx                  | 66 +++++++++----------
 .../laplace/impls/cyclic/cyclic_laplace.cxx   | 12 +++-
 tests/integrated/test-laplace/runtest         | 11 ++--
 3 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx
index 00851b52da..898abf9f58 100644
--- a/include/cyclic_reduction.hxx
+++ b/include/cyclic_reduction.hxx
@@ -221,9 +221,6 @@ public:
     //
     // Here PE 0 would have myns=2, PE 1 and 2 would have myns=1
 
-    int ns = Nsys / ngatherprocs;      // Number of systems to assign to all gathering processors
-    int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra
-
     // Receive requests
 
     if (myns > 0) {
@@ -270,38 +267,42 @@ public:
     // Interval between gathering processors
     BoutReal pinterval = static_cast<BoutReal>(nprocs) / ngatherprocs;
 
-    int s0 = 0;  // System number
+    {
+      int ns = Nsys / ngatherprocs;      // Number of systems to assign to all gathering processors
+      int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra
+      int s0 = 0;  // Starting system number
 
-    // Loop over gathering processors
-    for (int i = 0; i < ngatherprocs; i++) {
+      // Loop over gathering processors
+      for (int i = 0; i < ngatherprocs; i++) {
 
-      int p = i; // Gathering onto all processors
-      if (ngatherprocs != nprocs) {
-        // Gathering onto only some
-        p = static_cast<int>( pinterval * i );
-      }
+        int p = i; // Gathering onto all processors
+        if (ngatherprocs != nprocs) {
+          // Gathering onto only some
+          p = static_cast<int>( pinterval * i );
+        }
 
-      int nsp = ns; // Number of systems to send to this processor
-      if (i < nsextra)
-        nsp++; // Some processors get an extra system
+        int nsp = ns; // Number of systems to send to this processor
+        if (i < nsextra)
+          nsp++; // Some processors get an extra system
 
-      gather_proc[i] = p;
-      gather_sys_offset[i] = s0;
-      gather_nsys[i] = nsp;
+        gather_proc[i] = p;
+        gather_sys_offset[i] = s0;
+        gather_nsys[i] = nsp;
 
-      if ((p != myproc) && (nsp > 0)) {
+        if ((p != myproc) && (nsp > 0)) {
 #ifdef DIAGNOSE
-        output << "Sending to " << p << endl;
+          output << "Sending to " << p << endl;
 #endif
-        MPI_Send(&myif(s0, 0),        // Data pointer
-                 8 * nsp * sizeof(T), // Number
-                 MPI_BYTE,            // Type
-                 p,                   // Destination
-                 myproc,              // Message identifier
-                 comm);               // Communicator
-      }
+          MPI_Send(&myif(s0, 0),        // Data pointer
+                   8 * nsp * sizeof(T), // Number
+                   MPI_BYTE,            // Type
+                   p,                   // Destination
+                   myproc,              // Message identifier
+                   comm);               // Communicator
+        }
 
-      s0 += nsp;
+        s0 += nsp;
+      }
     }
 
     if (myns > 0) {
@@ -409,11 +410,11 @@ public:
           output << "Expecting receive from " << p << " of size " << len << endl;
 #endif
           MPI_Irecv(&gather_buffer(i, 0), len,
-                    MPI_BYTE, // Just sending raw data, unknown type
-                    p,        // Destination processor
+                    MPI_BYTE, // Just receiving raw data, unknown type
+                    p,        // Origin processor
                     p,        // Identifier
                     comm,     // Communicator
-                    &gather_req[p]); // Request
+                    &gather_req[i]); // Request
         } else {
           gather_req[i] = MPI_REQUEST_NULL;
         }
@@ -448,7 +449,6 @@ public:
         if (fromind != MPI_UNDEFINED) {
           // Copy data
 
-          int fromproc = gather_proc[fromind]; // From processor
           int s0 = gather_sys_offset[fromind]; // System index start
           int nsp = gather_nsys[fromind];      // Number of systems
 
@@ -458,10 +458,10 @@ public:
             xn[s0 + i] = gather_buffer(fromind, 2 * i + 1);
 #ifdef DIAGNOSE
             output << "Received x1,xn[" << s0 + i << "] = " << x1[s0 + i] << ", "
-                   << xn[s0 + i] << " from " << fromproc << endl;
+                   << xn[s0 + i] << " from " << gather_proc[fromind] << endl;
 #endif
           }
-          gather_req[fromproc] = MPI_REQUEST_NULL;
+          gather_req[fromind] = MPI_REQUEST_NULL;
         }
       } while (fromind != MPI_UNDEFINED);
     }
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
index af203d2ec5..4c9ac49618 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
@@ -55,10 +55,11 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in)
 
   OPTION(opt, dst, false);
 
-  if(dst) {
+  if (dst) {
     nmode = localmesh->LocalNz-2;
-  }else
+  } else {
     nmode = maxmode+1; // Number of Z modes. maxmode set in invert_laplace.cxx from options
+  }
 
   // Note nmode == nsys of cyclic_reduction
 
@@ -81,8 +82,13 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in)
   xcmplx.reallocate(nmode, n);
   bcmplx.reallocate(nmode, n);
 
+  int ngather =
+      (*opt)["ngather"]
+          .doc("Number of processors in X to gather onto. Default (0) is all processors")
+          .withDefault(0);
+
   // Create a cyclic reduction object, operating on dcomplex values
-  cr = new CyclicReduce<dcomplex>(localmesh->getXcomm(), n);
+  cr = new CyclicReduce<dcomplex>(localmesh->getXcomm(), n, ngather);
   cr->setPeriodic(localmesh->periodicX);
 }
 
diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest
index d4a501f184..446e808037 100755
--- a/tests/integrated/test-laplace/runtest
+++ b/tests/integrated/test-laplace/runtest
@@ -52,18 +52,15 @@ for v in vars:
 print("Running Laplacian inversion test")
 success = True
 
-for nproc in [1, 2, 4]:
-    nxpe = 1
-    if nproc > 2:
-        nxpe = 2
+for nproc, nxpe, ngather in [(1,1,1), (2,1,1), (2,2,2), (4,2,1), (4,4,2)]:
 
-    cmd = "./test_laplace nxpe=" + str(nxpe)
+    cmd = "./test_laplace nxpe=" + str(nxpe)+ " laplace:ngather=" + str(ngather)
 
     shell("rm data/BOUT.dmp.*.nc")
 
-    print("   %d processors (nxpe = %d)...." % (nproc, nxpe))
+    print("   {} processors (nxpe = {}, ngather = {})....".format(nproc, nxpe, ngather))
     s, out = launch_safe(cmd, nproc=nproc, mthread=1, pipe=True)
-    with open("run.log." + str(nproc), "w") as f:
+    with open("run.log." + str(nproc) + "." + str(nxpe) + "." + str(ngather), "w") as f:
         f.write(out)
 
     # Collect output data

From 764725e2b0e6c34ec1cf6d6bda09c16ff6066d4d Mon Sep 17 00:00:00 2001
From: Ben Dudson <bd512@york.ac.uk>
Date: Thu, 13 May 2021 10:18:06 +0100
Subject: [PATCH 04/13] Clang and Black formatting

---
 include/cyclic_reduction.hxx                  | 103 ++++++++--------
 .../laplace/impls/cyclic/cyclic_laplace.cxx   | 115 +++++++++---------
 tests/integrated/test-laplace/runtest         |   4 +-
 3 files changed, 114 insertions(+), 108 deletions(-)

diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx
index 898abf9f58..26aa0f85f3 100644
--- a/include/cyclic_reduction.hxx
+++ b/include/cyclic_reduction.hxx
@@ -43,22 +43,24 @@
 //#define DIAGNOSE 1
 
 #include "mpi.h"
-#include "utils.hxx"
 #include "msg_stack.hxx"
+#include "utils.hxx"
 #include <lapack_routines.hxx>
 
-#include "bout/assert.hxx"
 #include "boutexception.hxx"
+#include "bout/assert.hxx"
 
 #include "output.hxx"
 
 #include "bout/openmpwrap.hxx"
 
-template <class T> class CyclicReduce {
+template <class T>
+class CyclicReduce {
 public:
   CyclicReduce() = default;
 
-  CyclicReduce(MPI_Comm c, int size, int ngather=0) : comm(c), ngatherprocs(ngather), N(size) {
+  CyclicReduce(MPI_Comm c, int size, int ngather = 0)
+      : comm(c), ngatherprocs(ngather), N(size) {
     setup(c, size, ngather);
   }
 
@@ -68,7 +70,7 @@ public:
   /// @param[in] c  The communicator of all processors involved in the solve
   /// @param[in] size  The number of rows on this processor
   /// @param[in] gather  The number of processors to gather onto. If 0, use all processors
-  void setup(MPI_Comm c, int size, int ngather=0) {
+  void setup(MPI_Comm c, int size, int ngather = 0) {
     comm = c;
 
     int np, myp;
@@ -99,7 +101,7 @@ public:
   /// By default not periodic
   void setPeriodic(bool p = true) { periodic = p; }
 
-  void setCoefs(const Array<T> &a, const Array<T> &b, const Array<T> &c) {
+  void setCoefs(const Array<T>& a, const Array<T>& b, const Array<T>& c) {
     ASSERT2(a.size() == b.size());
     ASSERT2(a.size() == c.size());
     ASSERT2(a.size() == N);
@@ -108,7 +110,7 @@ public:
     Matrix<T> bMatrix(1, N);
     Matrix<T> cMatrix(1, N);
 
-    BOUT_OMP(parallel for)    
+    BOUT_OMP(parallel for)
     for (int i = 0; i < N; ++i) {
       aMatrix(0, i) = a[i];
       bMatrix(0, i) = b[i];
@@ -148,7 +150,7 @@ public:
   ///
   /// @param[in] rhs Array storing Values of the rhs for a single system
   /// @param[out] x  Array storing the result for a single system
-  void solve(const Array<T> &rhs, Array<T> &x) {
+  void solve(const Array<T>& rhs, Array<T>& x) {
     ASSERT2(rhs.size() == x.size());
     ASSERT2(rhs.size() == N);
 
@@ -175,7 +177,7 @@ public:
   ///
   /// @param[in] rhs Matrix storing Values of the rhs for each system
   /// @param[out] x  Matrix storing the result for each system
-  void solve(const Matrix<T> &rhs, Matrix<T> &x) {
+  void solve(const Matrix<T>& rhs, Matrix<T>& x) {
     TRACE("CyclicReduce::solve");
     ASSERT2(static_cast<int>(std::get<0>(rhs.shape())) == Nsys);
     ASSERT2(static_cast<int>(std::get<0>(x.shape())) == Nsys);
@@ -227,7 +229,7 @@ public:
       // If gathering systems onto this processor
       // post receives from all other processors
 
-      all_req.resize(nprocs); // One communicator per processor
+      all_req.resize(nprocs);                  // One communicator per processor
       all_buffer.reallocate(nprocs, myns * 8); // Storage for systems from each processor
 
       all_req[myproc] = MPI_REQUEST_NULL;
@@ -239,7 +241,7 @@ public:
 
         if (p == myproc) {
           // Just copy the data
-	  BOUT_OMP(parallel for)
+          BOUT_OMP(parallel for)
           for (int i = 0; i < myns; i++)
             for (int j = 0; j < 8; j++)
               ifcs(i, 8 * p + j) = myif(sys0 + i, j);
@@ -248,10 +250,10 @@ public:
           output << "Expecting to receive " << len << " from " << p << endl;
 #endif
           MPI_Irecv(&all_buffer(p, 0), len,
-                    MPI_BYTE, // Just sending raw data, unknown type
-                    p,        // Destination processor
-                    p,        // Identifier
-                    comm,     // Communicator
+                    MPI_BYTE,     // Just sending raw data, unknown type
+                    p,            // Destination processor
+                    p,            // Identifier
+                    comm,         // Communicator
                     &all_req[p]); // Request
         }
       }
@@ -259,18 +261,19 @@ public:
 
     // Send data to the processors which are gathering
 
-    gather_req.resize(ngatherprocs);   // One request per gathering processor
-    gather_proc.resize(ngatherprocs);  // Processor number
-    gather_sys_offset.resize(ngatherprocs);  // Index of the first system gathered
-    gather_nsys.resize(ngatherprocs);  // Number of systems
+    gather_req.resize(ngatherprocs);        // One request per gathering processor
+    gather_proc.resize(ngatherprocs);       // Processor number
+    gather_sys_offset.resize(ngatherprocs); // Index of the first system gathered
+    gather_nsys.resize(ngatherprocs);       // Number of systems
 
     // Interval between gathering processors
     BoutReal pinterval = static_cast<BoutReal>(nprocs) / ngatherprocs;
 
     {
-      int ns = Nsys / ngatherprocs;      // Number of systems to assign to all gathering processors
+      int ns =
+          Nsys / ngatherprocs; // Number of systems to assign to all gathering processors
       int nsextra = Nsys % ngatherprocs; // Number of processors with 1 extra
-      int s0 = 0;  // Starting system number
+      int s0 = 0;                        // Starting system number
 
       // Loop over gathering processors
       for (int i = 0; i < ngatherprocs; i++) {
@@ -278,7 +281,7 @@ public:
         int p = i; // Gathering onto all processors
         if (ngatherprocs != nprocs) {
           // Gathering onto only some
-          p = static_cast<int>( pinterval * i );
+          p = static_cast<int>(pinterval * i);
         }
 
         int nsp = ns; // Number of systems to send to this processor
@@ -318,7 +321,7 @@ public:
 #ifdef DIAGNOSE
           output << "Copying received data from " << p << endl;
 #endif
-	  BOUT_OMP(parallel for)
+          BOUT_OMP(parallel for)
           for (int i = 0; i < myns; i++)
             for (int j = 0; j < 8; j++) {
 #ifdef DIAGNOSE
@@ -348,7 +351,7 @@ public:
       if2x2.ensureUnique();
       x1.ensureUnique();
       xn.ensureUnique();
-      
+
       BOUT_OMP(parallel for)
       for (int i = 0; i < myns; ++i) {
         //  (a  b) (x1) = (b1)
@@ -392,14 +395,14 @@ public:
       // Post receives
       // Loop over gathering processors
       for (int i = 0; i < ngatherprocs; i++) {
-        int p = gather_proc[i]; // Processor number
+        int p = gather_proc[i];   // Processor number
         int nsp = gather_nsys[i]; // Number of systems
 
         int len = 2 * nsp * sizeof(T); // 2 values per system
 
         if (p == myproc) {
           // Just copy the data
-	  BOUT_OMP(parallel for)	  
+          BOUT_OMP(parallel for)
           for (int i = 0; i < myns; i++) {
             x1[sys0 + i] = ifx(i, 2 * p);
             xn[sys0 + i] = ifx(i, 2 * p + 1);
@@ -410,10 +413,10 @@ public:
           output << "Expecting receive from " << p << " of size " << len << endl;
 #endif
           MPI_Irecv(&gather_buffer(i, 0), len,
-                    MPI_BYTE, // Just receiving raw data, unknown type
-                    p,        // Origin processor
-                    p,        // Identifier
-                    comm,     // Communicator
+                    MPI_BYTE,        // Just receiving raw data, unknown type
+                    p,               // Origin processor
+                    p,               // Identifier
+                    comm,            // Communicator
                     &gather_req[i]); // Request
         } else {
           gather_req[i] = MPI_REQUEST_NULL;
@@ -424,7 +427,7 @@ public:
         // Send data
         for (int p = 0; p < nprocs; p++) { // Loop over processor
           if (p != myproc) {
-	    BOUT_OMP(parallel for)	    
+            BOUT_OMP(parallel for)
             for (int i = 0; i < myns; i++) {
               ifp[2 * i] = ifx(i, 2 * p);
               ifp[2 * i + 1] = ifx(i, 2 * p + 1);
@@ -452,7 +455,7 @@ public:
           int s0 = gather_sys_offset[fromind]; // System index start
           int nsp = gather_nsys[fromind];      // Number of systems
 
-	  BOUT_OMP(parallel for)
+          BOUT_OMP(parallel for)
           for (int i = 0; i < nsp; i++) {
             x1[s0 + i] = gather_buffer(fromind, 2 * i);
             xn[s0 + i] = gather_buffer(fromind, 2 * i + 1);
@@ -475,15 +478,15 @@ private:
   MPI_Comm comm;             ///< Communicator
   int nprocs{0}, myproc{-1}; ///< Number of processors and ID of my processor
 
-  int ngatherprocs{0};  ///< Number of processors to gather onto
+  int ngatherprocs{0};                 ///< Number of processors to gather onto
   std::vector<MPI_Request> gather_req; ///< Comms with gathering processors
-  std::vector<int> gather_proc;  ///< The processor number
-  std::vector<int> gather_sys_offset; ///< Index of the first system gathered
-  std::vector<int> gather_nsys;       ///< Number of systems gathered
-  Matrix<T> gather_buffer; ///< Buffer for receiving from gathering processors
+  std::vector<int> gather_proc;        ///< The processor number
+  std::vector<int> gather_sys_offset;  ///< Index of the first system gathered
+  std::vector<int> gather_nsys;        ///< Number of systems gathered
+  Matrix<T> gather_buffer;             ///< Buffer for receiving from gathering processors
 
   std::vector<MPI_Request> all_req; ///< Requests for comms with all processors
-  Matrix<T> all_buffer; ///< Buffer for receiving from all other processors
+  Matrix<T> all_buffer;             ///< Buffer for receiving from all other processors
 
   int N{0};    ///< Total size of the problem
   int Nsys{0}; ///< Number of independent systems to solve
@@ -495,11 +498,11 @@ private:
   Matrix<T> coefs; ///< Starting coefficients, rhs [Nsys, {3*coef,rhs}*N]
   Matrix<T> myif;  ///< Interface equations for this processor
 
-  Matrix<T> ifcs;       ///< Coefficients for interface solve
-  Matrix<T> if2x2;      ///< 2x2 interface equations on this processor
-  Matrix<T> ifx;        ///< Solution of interface equations
-  Array<T> ifp;         ///< Interface equations returned to processor p
-  Array<T> x1, xn;      ///< Interface solutions for back-solving
+  Matrix<T> ifcs;  ///< Coefficients for interface solve
+  Matrix<T> if2x2; ///< 2x2 interface equations on this processor
+  Matrix<T> ifx;   ///< Solution of interface equations
+  Array<T> ifp;    ///< Interface equations returned to processor p
+  Array<T> x1, xn; ///< Interface solutions for back-solving
 
   /// Allocate memory arrays
   /// @param[in] nsys  Number of independent systems to solve
@@ -535,7 +538,7 @@ private:
 
       // Calculate which processors these are
       for (int i = 0; i < ngatherprocs; i++) {
-        int proc = static_cast<int>( pinterval * i );
+        int proc = static_cast<int>(pinterval * i);
 
         if (proc == myproc) {
           // This processor is receiving
@@ -584,12 +587,12 @@ private:
   /// (      a3 b3 c3            )   =>  (   A2 B2 C2)
   /// (              ...         )
   /// (                  an bn cn)
-  void reduce(int ns, int nloc, Matrix<T> &co, Matrix<T> &ifc) {
+  void reduce(int ns, int nloc, Matrix<T>& co, Matrix<T>& ifc) {
 #ifdef DIAGNOSE
     if (nloc < 2)
       throw BoutException("CyclicReduce::reduce nloc < 2");
 #endif
-    
+
     BOUT_OMP(parallel for)
     for (int j = 0; j < ns; j++) {
       // Calculate upper interface equation
@@ -662,20 +665,20 @@ private:
                   const Array<T>& xn, Matrix<T>& xa) {
 
     xa.ensureUnique(); // Going to be modified, so call this outside parallel region
-    
+
     // Tridiagonal system, solve using serial Thomas algorithm
     // xa -- Result for each system
     // co -- Coefficients & rhs for each system
     BOUT_OMP(parallel for)
     for (int i = 0; i < ns; i++) { // Loop over systems
-      Array<T> gam(nloc); // Thread-local array
+      Array<T> gam(nloc);          // Thread-local array
       T bet = 1.0;
       xa(i, 0) = x1[i]; // Already know the first
       gam[1] = 0.;
       for (int j = 1; j < nloc - 1; j++) {
         bet = co(i, 4 * j + 1) - co(i, 4 * j) * gam[j]; // bet = b[j]-a[j]*gam[j]
-        xa(i, j) = (co(i, 4 * j + 3) - co(i, 4 * j) * xa(i, j - 1)) /
-                   bet;                      // x[j] = (r[j]-a[j]*x[j-1])/bet;
+        xa(i, j) = (co(i, 4 * j + 3) - co(i, 4 * j) * xa(i, j - 1))
+                   / bet;                    // x[j] = (r[j]-a[j]*x[j-1])/bet;
         gam[j + 1] = co(i, 4 * j + 2) / bet; // gam[j+1] = c[j]/bet
       }
       xa(i, nloc - 1) = xn[i]; // Know the last value
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
index 4c9ac49618..9ab6e5f942 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
@@ -33,18 +33,18 @@
  *
  */
 
-#include <globals.hxx>
-#include <boutexception.hxx>
+#include <bout/constants.hxx>
 #include <bout/mesh.hxx>
-#include <utils.hxx>
-#include <fft.hxx>
 #include <bout/sys/timer.hxx>
-#include <bout/constants.hxx>
+#include <boutexception.hxx>
+#include <fft.hxx>
+#include <globals.hxx>
 #include <output.hxx>
+#include <utils.hxx>
 
 #include "cyclic_laplace.hxx"
 
-LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in)
+LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in)
     : Laplacian(opt, loc, mesh_in), Acoef(0.0), C1coef(1.0), C2coef(1.0), Dcoef(1.0) {
   Acoef.setLocation(location);
   C1coef.setLocation(location);
@@ -56,9 +56,10 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in)
   OPTION(opt, dst, false);
 
   if (dst) {
-    nmode = localmesh->LocalNz-2;
+    nmode = localmesh->LocalNz - 2;
   } else {
-    nmode = maxmode+1; // Number of Z modes. maxmode set in invert_laplace.cxx from options
+    nmode =
+        maxmode + 1; // Number of Z modes. maxmode set in invert_laplace.cxx from options
   }
 
   // Note nmode == nsys of cyclic_reduction
@@ -66,15 +67,19 @@ LaplaceCyclic::LaplaceCyclic(Options *opt, const CELL_LOC loc, Mesh *mesh_in)
   // Allocate arrays
 
   xs = localmesh->xstart; // Starting X index
-  if(localmesh->firstX() && !localmesh->periodicX){ // Only want to include guard cells at boundaries (unless periodic in x)
-	  xs = 0;
+  if (localmesh->firstX()
+      && !localmesh->periodicX) { // Only want to include guard cells at boundaries
+                                  // (unless periodic in x)
+    xs = 0;
   }
-  xe = localmesh->xend;   // Last X index
-  if(localmesh->lastX() && !localmesh->periodicX){ // Only want to include guard cells at boundaries (unless periodic in x)
-	  xe = localmesh->LocalNx-1;
+  xe = localmesh->xend; // Last X index
+  if (localmesh->lastX()
+      && !localmesh->periodicX) { // Only want to include guard cells at boundaries
+                                  // (unless periodic in x)
+    xe = localmesh->LocalNx - 1;
   }
-  int n = xe - xs + 1;  // Number of X points on this processor,
-                        // including boundaries but not guard cells
+  int n = xe - xs + 1; // Number of X points on this processor,
+                       // including boundaries but not guard cells
 
   a.reallocate(nmode, n);
   b.reallocate(nmode, n);
@@ -104,26 +109,26 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
 
   FieldPerp x{emptyFrom(rhs)}; // Result
 
-  int jy = rhs.getIndex();  // Get the Y index
+  int jy = rhs.getIndex(); // Get the Y index
   x.setIndex(jy);
 
   // Get the width of the boundary
 
   // If the flags to assign that only one guard cell should be used is set
-  int inbndry = localmesh->xstart, outbndry=localmesh->xstart;
-  if((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2))  {
+  int inbndry = localmesh->xstart, outbndry = localmesh->xstart;
+  if ((global_flags & INVERT_BOTH_BNDRY_ONE) || (localmesh->xstart < 2)) {
     inbndry = outbndry = 1;
   }
-  if(inner_boundary_flags & INVERT_BNDRY_ONE)
+  if (inner_boundary_flags & INVERT_BNDRY_ONE)
     inbndry = 1;
-  if(outer_boundary_flags & INVERT_BNDRY_ONE)
+  if (outer_boundary_flags & INVERT_BNDRY_ONE)
     outbndry = 1;
 
-  if(dst) {
+  if (dst) {
     BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d =
-          Array<dcomplex>(localmesh->LocalNz); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>(
+          localmesh->LocalNz); // ZFFT routine expects input of this length
 
       // Loop over X indices, including boundaries but not guard cells. (unless periodic
       // in x)
@@ -131,9 +136,9 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
       for (int ix = xs; ix <= xe; ix++) {
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) ||
-            ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) &&
-             localmesh->lastX())) {
+        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+            || ((localmesh->LocalNx - ix - 1 < outbndry)
+                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
           // Use the values in x0 in the boundary
           DST(x0[ix] + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -169,8 +174,8 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     // FFT back to real space
     BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d =
-          Array<dcomplex>(localmesh->LocalNz); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>(
+          localmesh->LocalNz); // ZFFT routine expects input of this length
 
       BOUT_OMP(for nowait)
       for (int ix = xs; ix <= xe; ix++) {
@@ -186,12 +191,11 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
         x(ix, localmesh->LocalNz - 1) = -x(ix, localmesh->LocalNz - 3);
       }
     }
-  }else {
-    BOUT_OMP(parallel)
-    {
+  } else {
+    BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2 +
-                                 1); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
+                                 + 1); // ZFFT routine expects input of this length
 
       // Loop over X indices, including boundaries but not guard cells (unless periodic in
       // x)
@@ -199,9 +203,9 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
       for (int ix = xs; ix <= xe; ix++) {
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) ||
-            ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) &&
-             localmesh->lastX())) {
+        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+            || ((localmesh->LocalNx - ix - 1 < outbndry)
+                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
           // Use the values in x0 in the boundary
           rfft(x0[ix], localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -232,11 +236,10 @@ FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
     cr->solve(bcmplx, xcmplx);
 
     // FFT back to real space
-    BOUT_OMP(parallel)
-    {
+    BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2 +
-                                 1); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
+                                 + 1); // ZFFT routine expects input of this length
 
       const bool zero_DC = global_flags & INVERT_ZERO_DC;
 
@@ -298,8 +301,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
   }
   if (localmesh->hasBndryUpperY()) {
     if (include_yguards)
-      ye = localmesh->LocalNy -
-           1; // Contains upper boundary and we are solving in the guard cells
+      ye = localmesh->LocalNy
+           - 1; // Contains upper boundary and we are solving in the guard cells
 
     ye -= extra_yguards_upper;
   }
@@ -318,8 +321,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
   if (dst) {
     BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d =
-          Array<dcomplex>(localmesh->LocalNz); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>(
+          localmesh->LocalNz); // ZFFT routine expects input of this length
 
       // Loop over X and Y indices, including boundaries but not guard cells.
       // (unless periodic in x)
@@ -331,9 +334,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) ||
-            ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) &&
-             localmesh->lastX())) {
+        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+            || ((localmesh->LocalNx - ix - 1 < outbndry)
+                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
           // Use the values in x0 in the boundary
           DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -374,8 +377,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
     // FFT back to real space
     BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d =
-          Array<dcomplex>(localmesh->LocalNz); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>(
+          localmesh->LocalNz); // ZFFT routine expects input of this length
 
       BOUT_OMP(for nowait)
       for (int ind = 0; ind < nxny; ++ind) { // Loop over X and Y
@@ -399,8 +402,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
   } else {
     BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d = Array<dcomplex>(localmesh->LocalNz / 2 +
-                                 1); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>(localmesh->LocalNz / 2
+                                 + 1); // ZFFT routine expects input of this length
 
       // Loop over X and Y indices, including boundaries but not guard cells
       // (unless periodic in x)
@@ -413,9 +416,9 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX()) ||
-            ((localmesh->LocalNx - ix - 1 < outbndry) && (outer_boundary_flags & INVERT_SET) &&
-             localmesh->lastX())) {
+        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+            || ((localmesh->LocalNx - ix - 1 < outbndry)
+                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
           // Use the values in x0 in the boundary
           rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d));
         } else {
@@ -452,8 +455,8 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
     // FFT back to real space
     BOUT_OMP(parallel) {
       /// Create a local thread-scope working array
-      auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2 +
-                                 1); // ZFFT routine expects input of this length
+      auto k1d = Array<dcomplex>((localmesh->LocalNz) / 2
+                                 + 1); // ZFFT routine expects input of this length
 
       const bool zero_DC = global_flags & INVERT_ZERO_DC;
 
diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest
index 446e808037..454716cc3e 100755
--- a/tests/integrated/test-laplace/runtest
+++ b/tests/integrated/test-laplace/runtest
@@ -52,9 +52,9 @@ for v in vars:
 print("Running Laplacian inversion test")
 success = True
 
-for nproc, nxpe, ngather in [(1,1,1), (2,1,1), (2,2,2), (4,2,1), (4,4,2)]:
+for nproc, nxpe, ngather in [(1, 1, 1), (2, 1, 1), (2, 2, 2), (4, 2, 1), (4, 4, 2)]:
 
-    cmd = "./test_laplace nxpe=" + str(nxpe)+ " laplace:ngather=" + str(ngather)
+    cmd = "./test_laplace nxpe=" + str(nxpe) + " laplace:ngather=" + str(ngather)
 
     shell("rm data/BOUT.dmp.*.nc")
 

From a5f0a64bc9583f4ea6a344740987e2c67380b018 Mon Sep 17 00:00:00 2001
From: Ben Dudson <bd512@york.ac.uk>
Date: Thu, 13 May 2021 15:26:13 +0100
Subject: [PATCH 05/13] Tidying, using unique_ptr

Clang-tidy and @ZedThree suggestions
---
 include/cyclic_reduction.hxx                    |  9 +++++----
 .../laplace/impls/cyclic/cyclic_laplace.cxx     | 17 +++++++----------
 .../laplace/impls/cyclic/cyclic_laplace.hxx     |  5 ++---
 tests/integrated/test-cyclic/test_cyclic.cxx    |  5 +----
 4 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/include/cyclic_reduction.hxx b/include/cyclic_reduction.hxx
index 26aa0f85f3..4395bb0ca3 100644
--- a/include/cyclic_reduction.hxx
+++ b/include/cyclic_reduction.hxx
@@ -285,8 +285,9 @@ public:
         }
 
         int nsp = ns; // Number of systems to send to this processor
-        if (i < nsextra)
+        if (i < nsextra) {
           nsp++; // Some processors get an extra system
+        }
 
         gather_proc[i] = p;
         gather_sys_offset[i] = s0;
@@ -444,7 +445,7 @@ public:
       }
 
       // Wait for data
-      int fromind;
+      int fromind{MPI_UNDEFINED};
       do {
         MPI_Status stat;
         MPI_Waitany(ngatherprocs, gather_req.data(), &fromind, &stat);
@@ -490,8 +491,8 @@ private:
 
   int N{0};    ///< Total size of the problem
   int Nsys{0}; ///< Number of independent systems to solve
-  int myns;    ///< Number of systems for interface solve on this processor
-  int sys0;    ///< Starting system index for interface solve
+  int myns{0}; ///< Number of systems for interface solve on this processor
+  int sys0{0}; ///< Starting system index for interface solve
 
   bool periodic{false}; ///< Is the domain periodic?
 
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
index 9ab6e5f942..f4415fb7aa 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
@@ -93,15 +93,10 @@ LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in)
           .withDefault(0);
 
   // Create a cyclic reduction object, operating on dcomplex values
-  cr = new CyclicReduce<dcomplex>(localmesh->getXcomm(), n, ngather);
+  cr = std::make_unique<CyclicReduce<dcomplex>>(localmesh->getXcomm(), n, ngather);
   cr->setPeriodic(localmesh->periodicX);
 }
 
-LaplaceCyclic::~LaplaceCyclic() {
-  // Delete tridiagonal solver
-  delete cr;
-}
-
 FieldPerp LaplaceCyclic::solve(const FieldPerp& rhs, const FieldPerp& x0) {
   ASSERT1(localmesh == rhs.getMesh() && localmesh == x0.getMesh());
   ASSERT1(rhs.getLocation() == location);
@@ -334,9 +329,10 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take DST in Z direction and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
+             && localmesh->firstX())
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
+                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
           // Use the values in x0 in the boundary
           DST(x0(ix, iy) + 1, localmesh->LocalNz - 2, std::begin(k1d));
         } else {
@@ -416,9 +412,10 @@ Field3D LaplaceCyclic::solve(const Field3D& rhs, const Field3D& x0) {
 
         // Take FFT in Z direction, apply shift, and put result in k1d
 
-        if (((ix < inbndry) && (inner_boundary_flags & INVERT_SET) && localmesh->firstX())
+        if (((ix < inbndry) && ((inner_boundary_flags & INVERT_SET) != 0)
+             && localmesh->firstX())
             || ((localmesh->LocalNx - ix - 1 < outbndry)
-                && (outer_boundary_flags & INVERT_SET) && localmesh->lastX())) {
+                && ((outer_boundary_flags & INVERT_SET) != 0) && localmesh->lastX())) {
           // Use the values in x0 in the boundary
           rfft(x0(ix, iy), localmesh->LocalNz, std::begin(k1d));
         } else {
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
index 9e1161ecd8..17259bf661 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
@@ -49,8 +49,7 @@ RegisterLaplace<LaplaceCyclic> registerlaplacecycle(LAPLACE_CYCLIC);
 class LaplaceCyclic : public Laplacian {
 public:
   LaplaceCyclic(Options *opt = nullptr, const CELL_LOC loc = CELL_CENTRE, Mesh *mesh_in = nullptr);
-  ~LaplaceCyclic();
-  
+
   using Laplacian::setCoefA;
   void setCoefA(const Field2D &val) override {
     ASSERT1(val.getLocation() == location);
@@ -104,7 +103,7 @@ private:
   
   bool dst;
   
-  CyclicReduce<dcomplex> *cr; ///< Tridiagonal solver
+  std::unique_ptr<CyclicReduce<dcomplex>> cr; ///< Tridiagonal solver
 };
 
 #endif // __SPT_H__
diff --git a/tests/integrated/test-cyclic/test_cyclic.cxx b/tests/integrated/test-cyclic/test_cyclic.cxx
index 366ba2a6b4..6814d31138 100644
--- a/tests/integrated/test-cyclic/test_cyclic.cxx
+++ b/tests/integrated/test-cyclic/test_cyclic.cxx
@@ -37,7 +37,7 @@ int main(int argc, char **argv) {
       options["ngather"].doc("The number of processors to gather onto").withDefault(npe);
 
   // Create a cyclic reduction object, operating on Ts
-  auto* cr = new CyclicReduce<T>(BoutComm::get(), n, ngather);
+  auto cr = std::make_unique<CyclicReduce<T>>(BoutComm::get(), n, ngather);
 
   a.reallocate(nsys, n);
   b.reallocate(nsys, n);
@@ -89,9 +89,6 @@ int main(int argc, char **argv) {
   cr->setPeriodic(periodic);
   cr->setCoefs(a, b, c);
   cr->solve(rhs, x);
-
-  // Destroy solver
-  delete cr;
   
   // Check result
 

From 72b8e64933a71e71ce4b8beb642f182585372094 Mon Sep 17 00:00:00 2001
From: bendudson <bendudson@users.noreply.github.com>
Date: Mon, 25 Jul 2022 17:54:52 +0000
Subject: [PATCH 06/13] Apply clang-format changes

---
 src/invert/laplace/impls/cyclic/cyclic_laplace.cxx | 6 ++----
 src/invert/laplace/impls/cyclic/cyclic_laplace.hxx | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
index 4dd6efb3b9..a3d530cf23 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
@@ -41,7 +41,7 @@
 #if not BOUT_USE_METRIC_3D
 
 >>>>>>> next
-#include <bout/constants.hxx>
+    #include < bout / constants.hxx>
 #include <bout/mesh.hxx>
 #include <bout/sys/timer.hxx>
 #include <boutexception.hxx>
@@ -52,9 +52,7 @@
 
 #include "cyclic_laplace.hxx"
 
-LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in,
-                             Solver* UNUSED(solver), Datafile* UNUSED(dump))
-    : Laplacian(opt, loc, mesh_in), Acoef(0.0), C1coef(1.0), C2coef(1.0), Dcoef(1.0) {
+                      LaplaceCyclic::LaplaceCyclic(Options* opt, const CELL_LOC loc, Mesh* mesh_in, Solver* UNUSED(solver), Datafile* UNUSED(dump)) : Laplacian(opt, loc, mesh_in), Acoef(0.0), C1coef(1.0), C2coef(1.0), Dcoef(1.0) {
 
   Acoef.setLocation(location);
   C1coef.setLocation(location);
diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
index e76eff94be..03186e16ff 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
@@ -120,7 +120,7 @@ private:
   Matrix<dcomplex> a, b, c, bcmplx, xcmplx;
   
   bool dst;
-  
+
   std::unique_ptr<CyclicReduce<dcomplex>> cr; ///< Tridiagonal solver
 };
 

From 6db58ff97d2495892f5f9c5f2ebeb0c0027109d6 Mon Sep 17 00:00:00 2001
From: Ben Dudson <dudson2@llnl.gov>
Date: Mon, 25 Jul 2022 11:12:35 -0700
Subject: [PATCH 07/13] Missed a merge conflict in cyclic_laplace

---
 src/invert/laplace/impls/cyclic/cyclic_laplace.cxx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
index 4dd6efb3b9..e0e880c9b2 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.cxx
@@ -33,14 +33,11 @@
  *
  */
 
-<<<<<<< HEAD
-=======
 #include "cyclic_laplace.hxx"
 #include "bout/build_config.hxx"
 
 #if not BOUT_USE_METRIC_3D
 
->>>>>>> next
 #include <bout/constants.hxx>
 #include <bout/mesh.hxx>
 #include <bout/sys/timer.hxx>

From deb463f8bdb681377feb33fd4d5453f67fd2ea6d Mon Sep 17 00:00:00 2001
From: Ben Dudson <dudson2@llnl.gov>
Date: Mon, 25 Jul 2022 11:53:29 -0700
Subject: [PATCH 08/13] CyclicLaplace destructor replaced by unique_ptr

---
 src/invert/laplace/impls/cyclic/cyclic_laplace.hxx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
index 03186e16ff..bf7ed6d4ce 100644
--- a/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
+++ b/src/invert/laplace/impls/cyclic/cyclic_laplace.hxx
@@ -62,7 +62,6 @@ public:
   LaplaceCyclic(Options* opt = nullptr, const CELL_LOC loc = CELL_CENTRE,
                 Mesh* mesh_in = nullptr, Solver* solver = nullptr,
                 Datafile* dump = nullptr);
-  ~LaplaceCyclic();
 
   using Laplacian::setCoefA;
   void setCoefA(const Field2D &val) override {

From 7901661995dc4b0bcbdda88a6dac123884a3a06e Mon Sep 17 00:00:00 2001
From: bendudson <bendudson@users.noreply.github.com>
Date: Wed, 14 Jun 2023 21:06:59 +0000
Subject: [PATCH 09/13] Apply black changes

---
 tests/integrated/test-laplace/runtest | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest
index c7d9ab17dc..1aa389b1ec 100755
--- a/tests/integrated/test-laplace/runtest
+++ b/tests/integrated/test-laplace/runtest
@@ -57,14 +57,26 @@ for solver in ["cyclic", "pcr", "pcr_thomas"]:
     # Try different combinations of processor numbers
     # If not cyclic then ignore ngather option
     for nproc, nxpe, ngather in [(1, 1, 1), (2, 1, 1), (2, 2, 2), (4, 2, 1), (4, 4, 2)]:
-
-        cmd = "./test_laplace NXPE=" + str(nxpe) + " laplace:type=" + solver + (" laplace:ngather=" + str(ngather)) if solver == "cyclic" else ""
+        cmd = (
+            "./test_laplace NXPE="
+            + str(nxpe)
+            + " laplace:type="
+            + solver
+            + (" laplace:ngather=" + str(ngather))
+            if solver == "cyclic"
+            else ""
+        )
 
         shell("rm data/BOUT.dmp.*.nc")
 
-        print("   %s solver with %d processors (nxpe = %d, ngather = %d)...." % (solver, nproc, nxpe, ngather))
+        print(
+            "   %s solver with %d processors (nxpe = %d, ngather = %d)...."
+            % (solver, nproc, nxpe, ngather)
+        )
         s, out = launch_safe(cmd, nproc=nproc, mthread=1, pipe=True)
-        with open("run.log." + str(nproc) + "." + str(nxpe) + "." + str(ngather), "w") as f:
+        with open(
+            "run.log." + str(nproc) + "." + str(nxpe) + "." + str(ngather), "w"
+        ) as f:
             f.write(out)
 
         # Collect output data

From 47d69ff09e93533332fa3f96018b590d3860dd5e Mon Sep 17 00:00:00 2001
From: Ben Dudson <dudson2@llnl.gov>
Date: Wed, 14 Jun 2023 14:20:19 -0700
Subject: [PATCH 10/13] Clang-tidy suggestions: Add braces

---
 include/bout/cyclic_reduction.hxx | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/bout/cyclic_reduction.hxx b/include/bout/cyclic_reduction.hxx
index 7c166c9b09..f7dbc71cbd 100644
--- a/include/bout/cyclic_reduction.hxx
+++ b/include/bout/cyclic_reduction.hxx
@@ -247,9 +247,11 @@ public:
         if (p == myproc) {
           // Just copy the data
           BOUT_OMP(parallel for)
-          for (int i = 0; i < myns; i++)
-            for (int j = 0; j < 8; j++)
+          for (int i = 0; i < myns; i++) {
+            for (int j = 0; j < 8; j++) {
               ifcs(i, 8 * p + j) = myif(sys0 + i, j);
+            }
+          }
         } else {
 #ifdef DIAGNOSE
           output << "Expecting to receive " << len << " from " << p << endl;
@@ -327,13 +329,14 @@ public:
           output << "Copying received data from " << p << endl;
 #endif
           BOUT_OMP(parallel for)
-          for (int i = 0; i < myns; i++)
+          for (int i = 0; i < myns; i++) {
             for (int j = 0; j < 8; j++) {
 #ifdef DIAGNOSE
               output << "Value " << j << " : " << all_buffer(p, 8 * i + j) << endl;
 #endif
               ifcs(i, 8 * p + j) = all_buffer(p, 8 * i + j);
             }
+          }
           all_req[p] = MPI_REQUEST_NULL;
         }
       } while (p != MPI_UNDEFINED);
@@ -480,7 +483,7 @@ public:
   }
 
 private:
-  MPI_Comm comm;             ///< Communicator
+  MPI_Comm comm{};           ///< Communicator
   int nprocs{0}, myproc{-1}; ///< Number of processors and ID of my processor
 
   int ngatherprocs{0};                 ///< Number of processors to gather onto
@@ -512,8 +515,9 @@ private:
   /// Allocate memory arrays
   /// @param[in] nsys  Number of independent systems to solve
   void allocMemory(int nsys) {
-    if (nsys == Nsys)
+    if (nsys == Nsys) {
       return; // No need to allocate memory
+    }
 
     Nsys = nsys;
 

From 8a4d12221f620b093d6e9cc47301445acd578670 Mon Sep 17 00:00:00 2001
From: bendudson <bendudson@users.noreply.github.com>
Date: Wed, 14 Jun 2023 21:21:49 +0000
Subject: [PATCH 11/13] Apply clang-format changes

---
 externalpackages/PVODE/precon/band.cpp  | 2 +-
 externalpackages/PVODE/source/cvode.cpp | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/externalpackages/PVODE/precon/band.cpp b/externalpackages/PVODE/precon/band.cpp
index c1d0f6f1e2..8b444de191 100644
--- a/externalpackages/PVODE/precon/band.cpp
+++ b/externalpackages/PVODE/precon/band.cpp
@@ -185,7 +185,7 @@ integer gbfa(real **a, integer n, integer mu, integer ml, integer smu,
     if (col_k[storage_l] == ZERO) return(k+1);
     
     /* swap a(l,k) and a(k,k) if necessary */
-    
+
     if (swap = (l != k)) {
       temp = col_k[storage_l];
       col_k[storage_l] = *diag_k;
diff --git a/externalpackages/PVODE/source/cvode.cpp b/externalpackages/PVODE/source/cvode.cpp
index e28a3b1ae6..3617bf34ef 100644
--- a/externalpackages/PVODE/source/cvode.cpp
+++ b/externalpackages/PVODE/source/cvode.cpp
@@ -179,7 +179,7 @@ namespace pvode {
 
 #define MSG_Y0_NULL     CVM "y0=NULL illegal.\n\n"
 
-#define MSG_BAD_N       CVM "N=%ld < 1 illegal.\n\n"
+#define MSG_BAD_N CVM "N=%ld < 1 illegal.\n\n"
 
 #define MSG_BAD_LMM_1   CVM "lmm=%d illegal.\n"
 #define MSG_BAD_LMM_2   "The legal values are ADAMS=%d and BDF=%d.\n\n"
@@ -1825,7 +1825,7 @@ static int CVnls(CVodeMem cv_mem, int nflag)
   switch(iter) {
     case FUNCTIONAL : return(CVnlsFunctional(cv_mem));
     case NEWTON     : return(CVnlsNewton(cv_mem, nflag));
-  }
+    }
 }
 
 /***************** CVnlsFunctional ********************************
@@ -2391,7 +2391,7 @@ static int CVHandleFailure(CVodeMem cv_mem, int kflag)
                         return(SETUP_FAILURE);
     case SOLVE_FAILED:  fprintf(errfp, MSG_SOLVE_FAILED, tn);
                         return(SOLVE_FAILURE);
-  }
+    }
 }
 
 /*******************************************************************/

From b4f7ca417ff16547d6f24ca36aec51c724d3a94e Mon Sep 17 00:00:00 2001
From: Ben Dudson <dudson2@llnl.gov>
Date: Wed, 14 Jun 2023 14:51:35 -0700
Subject: [PATCH 12/13] Fix merge in PVODE

---
 externalpackages/PVODE/source/cvode.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/externalpackages/PVODE/source/cvode.cpp b/externalpackages/PVODE/source/cvode.cpp
index 418bd14be2..48ac3209f2 100644
--- a/externalpackages/PVODE/source/cvode.cpp
+++ b/externalpackages/PVODE/source/cvode.cpp
@@ -1825,13 +1825,9 @@ static int CVnls(CVodeMem cv_mem, int nflag)
   switch(iter) {
     case FUNCTIONAL : return(CVnlsFunctional(cv_mem));
     case NEWTON     : return(CVnlsNewton(cv_mem, nflag));
-<<<<<<< HEAD
-    }
-=======
   }
   fprintf(errfp, "Should be unreachable ...");
   return (ERR_FAILURE);
->>>>>>> v5.1.0-rc
 }
 
 /***************** CVnlsFunctional ********************************
@@ -2397,13 +2393,9 @@ static int CVHandleFailure(CVodeMem cv_mem, int kflag)
                         return(SETUP_FAILURE);
     case SOLVE_FAILED:  fprintf(errfp, MSG_SOLVE_FAILED, tn);
                         return(SOLVE_FAILURE);
-<<<<<<< HEAD
-    }
-=======
   }
   fprintf(errfp, "Should be unreachable ...");
   return (ERR_FAILURE);
->>>>>>> v5.1.0-rc
 }
 
 /*******************************************************************/

From b75ffa7a3fb63f86c407198c18188433afa617d5 Mon Sep 17 00:00:00 2001
From: Ben Dudson <dudson2@llnl.gov>
Date: Wed, 14 Jun 2023 15:26:15 -0700
Subject: [PATCH 13/13] test-laplace fix script adding ngather setting

`if` applied to entire string rather than just last part
---
 tests/integrated/test-laplace/runtest | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/integrated/test-laplace/runtest b/tests/integrated/test-laplace/runtest
index 1aa389b1ec..b435f80833 100755
--- a/tests/integrated/test-laplace/runtest
+++ b/tests/integrated/test-laplace/runtest
@@ -62,9 +62,7 @@ for solver in ["cyclic", "pcr", "pcr_thomas"]:
             + str(nxpe)
             + " laplace:type="
             + solver
-            + (" laplace:ngather=" + str(ngather))
-            if solver == "cyclic"
-            else ""
+            + ((" laplace:ngather=" + str(ngather)) if solver == "cyclic" else "")
         )
 
         shell("rm data/BOUT.dmp.*.nc")