matrices.cc
Go to the documentation of this file.
1// LIC// ====================================================================
2// LIC// This file forms part of oomph-lib, the object-oriented,
3// LIC// multi-physics finite-element library, available
4// LIC// at http://www.oomph-lib.org.
5// LIC//
6// LIC// Copyright (C) 2006-2022 Matthias Heil and Andrew Hazel
7// LIC//
8// LIC// This library is free software; you can redistribute it and/or
9// LIC// modify it under the terms of the GNU Lesser General Public
10// LIC// License as published by the Free Software Foundation; either
11// LIC// version 2.1 of the License, or (at your option) any later version.
12// LIC//
13// LIC// This library is distributed in the hope that it will be useful,
14// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// LIC// Lesser General Public License for more details.
17// LIC//
18// LIC// You should have received a copy of the GNU Lesser General Public
19// LIC// License along with this library; if not, write to the Free Software
20// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21// LIC// 02110-1301 USA.
22// LIC//
23// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
24// LIC//
25// LIC//====================================================================
26// Non-inline member functions for the matrix classes
27
28#ifdef OOMPH_HAS_MPI
29#include "mpi.h"
30#endif
31
32#include <cstring>
33
34#include <set>
35#include <map>
36
37//#include <valgrind/callgrind.h>
38
39// oomph-lib headers
40#include "matrices.h"
41#include "linear_solver.h"
42
43
44namespace oomph
45{
46 //============================================================================
47 /// Complete LU solve (overwrites RHS with solution). This is the
48 /// generic version which should not need to be over-written.
49 //============================================================================
51 {
52#ifdef PARANOID
53 if (Linear_solver_pt == 0)
54 {
55 throw OomphLibError("Linear_solver_pt not set in matrix",
56 OOMPH_CURRENT_FUNCTION,
57 OOMPH_EXCEPTION_LOCATION);
58 }
59#endif
60
61 // Copy rhs vector into local storage so it doesn't get overwritten
62 // if the linear solver decides to initialise the solution vector, say,
63 // which it's quite entitled to do!
64 DoubleVector actual_rhs(rhs);
65
66 // Use the linear algebra interface to the linear solver
67 Linear_solver_pt->solve(this, actual_rhs, rhs);
68 }
69
70 //============================================================================
71 /// Complete LU solve (Nothing gets overwritten!). This generic
72 /// version should never need to be overwritten
73 //============================================================================
75 {
76#ifdef PARANOID
77 if (Linear_solver_pt == 0)
78 {
79 throw OomphLibError("Linear_solver_pt not set in matrix",
80 OOMPH_CURRENT_FUNCTION,
81 OOMPH_EXCEPTION_LOCATION);
82 }
83#endif
84 // Use the linear algebra interface to the linear solver
85 Linear_solver_pt->solve(this, rhs, soln);
86 }
87
88 //============================================================================
89 /// Complete LU solve (overwrites RHS with solution). This is the
90 /// generic version which should not need to be over-written.
91 //============================================================================
93 {
94#ifdef PARANOID
95 if (Linear_solver_pt == 0)
96 {
97 throw OomphLibError("Linear_solver_pt not set in matrix",
98 OOMPH_CURRENT_FUNCTION,
99 OOMPH_EXCEPTION_LOCATION);
100 }
101#endif
102
103 // Copy rhs vector into local storage so it doesn't get overwritten
104 // if the linear solver decides to initialise the solution vector, say,
105 // which it's quite entitled to do!
106 Vector<double> actual_rhs(rhs);
107
108 // Use the linear algebra interface to the linear solver
109 Linear_solver_pt->solve(this, actual_rhs, rhs);
110 }
111
112 //============================================================================
113 /// Complete LU solve (Nothing gets overwritten!). This generic
114 /// version should never need to be overwritten
115 //============================================================================
117 {
118#ifdef PARANOID
119 if (Linear_solver_pt == 0)
120 {
121 throw OomphLibError("Linear_solver_pt not set in matrix",
122 OOMPH_CURRENT_FUNCTION,
123 OOMPH_EXCEPTION_LOCATION);
124 }
125#endif
126 // Use the linear algebra interface to the linear solver
127 Linear_solver_pt->solve(this, rhs, soln);
128 }
129
130
131 /// /////////////////////////////////////////////////////////////////////
132 /// /////////////////////////////////////////////////////////////////////
133 /// /////////////////////////////////////////////////////////////////////
134
135 //===============================================================
136 /// Constructor, set the default linear solver to be the DenseLU
137 /// solver
138 //===============================================================
140 {
142 }
143
144 //==============================================================
145 /// Constructor to build a square n by n matrix.
146 /// Set the default linear solver to be DenseLU
147 //==============================================================
149 : DenseMatrix<double>(n)
150 {
152 }
153
154
155 //=================================================================
156 /// Constructor to build a matrix with n rows and m columns.
157 /// Set the default linear solver to be DenseLU
158 //=================================================================
160 const unsigned long& m)
161 : DenseMatrix<double>(n, m)
162 {
164 }
165
166 //=====================================================================
167 /// Constructor to build a matrix with n rows and m columns,
168 /// with initial value initial_val
169 /// Set the default linear solver to be DenseLU
170 //=====================================================================
172 const unsigned long& m,
173 const double& initial_val)
174 : DenseMatrix<double>(n, m, initial_val)
175 {
177 }
178
179 //=======================================================================
180 /// Destructor delete the default linear solver
181 //======================================================================
183 {
184 // Delete the default linear solver
186 }
187
188 //============================================================================
189 /// LU decompose a matrix, by using the default linear solver
190 /// (DenseLU)
191 //============================================================================
193 {
194 // Use the default (DenseLU) solver to ludecompose the matrix
195 static_cast<DenseLU*>(Default_linear_solver_pt)->factorise(this);
196 }
197
198
199 //============================================================================
200 /// Back substitute an LU decomposed matrix.
201 //============================================================================
203 {
204 // Use the default (DenseLU) solver to perform the backsubstitution
205 static_cast<DenseLU*>(Default_linear_solver_pt)->backsub(rhs, rhs);
206 }
207
208 //============================================================================
209 /// Back substitute an LU decomposed matrix.
210 //============================================================================
212 {
213 // Use the default (DenseLU) solver to perform the backsubstitution
214 static_cast<DenseLU*>(Default_linear_solver_pt)->backsub(rhs, rhs);
215 }
216
217
218 //============================================================================
219 /// Determine eigenvalues and eigenvectors, using
220 /// Jacobi rotations. Only for symmetric matrices. Nothing gets overwritten!
221 /// - \c eigen_vect(i,j) = j-th component of i-th eigenvector.
222 /// - \c eigen_val[i] is the i-th eigenvalue; same ordering as in eigenvectors
223 //============================================================================
225 Vector<double>& eigen_vals, DenseMatrix<double>& eigen_vect) const
226 {
227#ifdef PARANOID
228 // Check Matrix is square
229 if (N != M)
230 {
231 throw OomphLibError(
232 "This matrix is not square, the matrix MUST be square!",
233 OOMPH_CURRENT_FUNCTION,
234 OOMPH_EXCEPTION_LOCATION);
235 }
236#endif
237 // Make a copy of the matrix & check that it's symmetric
238
239 // Check that the sizes of eigen_vals and eigen_vect are correct. If not
240 // correct them.
241 if (eigen_vals.size() != N)
242 {
243 eigen_vals.resize(N);
244 }
245 if (eigen_vect.ncol() != N || eigen_vect.nrow() != N)
246 {
247 eigen_vect.resize(N);
248 }
249
250 DenseDoubleMatrix working_matrix(N);
251 for (unsigned long i = 0; i < N; i++)
252 {
253 for (unsigned long j = 0; j < M; j++)
254 {
255#ifdef PARANOID
256 if (Matrixdata[M * i + j] != Matrixdata[M * j + i])
257 {
258 throw OomphLibError(
259 "Matrix needs to be symmetric for eigenvalues_by_jacobi()",
260 OOMPH_CURRENT_FUNCTION,
261 OOMPH_EXCEPTION_LOCATION);
262 }
263#endif
264 working_matrix(i, j) = (*this)(i, j);
265 }
266 }
267
268 DenseDoubleMatrix aux_eigen_vect(N);
269
270 throw OomphLibError("Sorry JacobiEigenSolver::jacobi() removed because of "
271 "licencing problems.",
272 OOMPH_CURRENT_FUNCTION,
273 OOMPH_EXCEPTION_LOCATION);
274
275 // // Call eigensolver
276 // unsigned long nrot;
277 // JacobiEigenSolver::jacobi(working_matrix, eigen_vals, aux_eigen_vect,
278 // nrot);
279
280 // Copy across (and transpose)
281 for (unsigned long i = 0; i < N; i++)
282 {
283 for (unsigned long j = 0; j < M; j++)
284 {
285 eigen_vect(i, j) = aux_eigen_vect(j, i);
286 }
287 }
288 }
289
290
291 //============================================================================
292 /// Multiply the matrix by the vector x: soln=Ax
293 //============================================================================
295 DoubleVector& soln) const
296 {
297#ifdef PARANOID
298 // Check to see if x.size() = ncol().
299 if (x.nrow() != this->ncol())
300 {
301 std::ostringstream error_message_stream;
302 error_message_stream << "The x vector is not the right size. It is "
303 << x.nrow() << ", it should be " << this->ncol()
304 << std::endl;
305 throw OomphLibError(error_message_stream.str(),
306 OOMPH_CURRENT_FUNCTION,
307 OOMPH_EXCEPTION_LOCATION);
308 }
309 // check that x is not distributed
310 if (x.distributed())
311 {
312 std::ostringstream error_message_stream;
313 error_message_stream
314 << "The x vector cannot be distributed for DenseDoubleMatrix "
315 << "matrix-vector multiply" << std::endl;
316 throw OomphLibError(error_message_stream.str(),
317 OOMPH_CURRENT_FUNCTION,
318 OOMPH_EXCEPTION_LOCATION);
319 }
320 // if soln is setup...
321 if (soln.built())
322 {
323 // check that soln is not distributed
324 if (soln.distributed())
325 {
326 std::ostringstream error_message_stream;
327 error_message_stream
328 << "The x vector cannot be distributed for DenseDoubleMatrix "
329 << "matrix-vector multiply" << std::endl;
330 throw OomphLibError(error_message_stream.str(),
331 OOMPH_CURRENT_FUNCTION,
332 OOMPH_EXCEPTION_LOCATION);
333 }
334 if (soln.nrow() != this->nrow())
335 {
336 std::ostringstream error_message_stream;
337 error_message_stream
338 << "The soln vector is setup and therefore must have the same "
339 << "number of rows as the matrix";
340 throw OomphLibError(error_message_stream.str(),
341 OOMPH_CURRENT_FUNCTION,
342 OOMPH_EXCEPTION_LOCATION);
343 }
344 if (*x.distribution_pt()->communicator_pt() !=
346 {
347 std::ostringstream error_message_stream;
348 error_message_stream
349 << "The soln vector and the x vector must have the same communicator"
350 << std::endl;
351 throw OomphLibError(error_message_stream.str(),
352 OOMPH_CURRENT_FUNCTION,
353 OOMPH_EXCEPTION_LOCATION);
354 }
355 }
356#endif
357
358 // if soln is not setup then setup the distribution
359 if (!soln.built())
360 {
362 x.distribution_pt()->communicator_pt(), this->nrow(), false);
363 soln.build(&dist, 0.0);
364 }
365
366 // Initialise the solution
367 soln.initialise(0.0);
368
369 // Multiply the matrix A, by the vector x
370 const double* x_pt = x.values_pt();
371 double* soln_pt = soln.values_pt();
372 for (unsigned long i = 0; i < N; i++)
373 {
374 for (unsigned long j = 0; j < M; j++)
375 {
376 soln_pt[i] += Matrixdata[M * i + j] * x_pt[j];
377 }
378 }
379 }
380
381
382 //=================================================================
383 /// Multiply the transposed matrix by the vector x: soln=A^T x
384 //=================================================================
386 DoubleVector& soln) const
387 {
388#ifdef PARANOID
389 // Check to see if x.size() = ncol().
390 if (x.nrow() != this->nrow())
391 {
392 std::ostringstream error_message_stream;
393 error_message_stream << "The x vector is not the right size. It is "
394 << x.nrow() << ", it should be " << this->nrow()
395 << std::endl;
396 throw OomphLibError(error_message_stream.str(),
397 OOMPH_CURRENT_FUNCTION,
398 OOMPH_EXCEPTION_LOCATION);
399 }
400 // check that x is not distributed
401 if (x.distributed())
402 {
403 std::ostringstream error_message_stream;
404 error_message_stream
405 << "The x vector cannot be distributed for DenseDoubleMatrix "
406 << "matrix-vector multiply" << std::endl;
407 throw OomphLibError(error_message_stream.str(),
408 OOMPH_CURRENT_FUNCTION,
409 OOMPH_EXCEPTION_LOCATION);
410 }
411 // if soln is setup...
412 if (soln.built())
413 {
414 // check that soln is not distributed
415 if (soln.distributed())
416 {
417 std::ostringstream error_message_stream;
418 error_message_stream
419 << "The x vector cannot be distributed for DenseDoubleMatrix "
420 << "matrix-vector multiply" << std::endl;
421 throw OomphLibError(error_message_stream.str(),
422 OOMPH_CURRENT_FUNCTION,
423 OOMPH_EXCEPTION_LOCATION);
424 }
425 if (soln.nrow() != this->ncol())
426 {
427 std::ostringstream error_message_stream;
428 error_message_stream
429 << "The soln vector is setup and therefore must have the same "
430 << "number of columns as the matrix";
431 throw OomphLibError(error_message_stream.str(),
432 OOMPH_CURRENT_FUNCTION,
433 OOMPH_EXCEPTION_LOCATION);
434 }
435 if (*soln.distribution_pt()->communicator_pt() !=
437 {
438 std::ostringstream error_message_stream;
439 error_message_stream
440 << "The soln vector and the x vector must have the same communicator"
441 << std::endl;
442 throw OomphLibError(error_message_stream.str(),
443 OOMPH_CURRENT_FUNCTION,
444 OOMPH_EXCEPTION_LOCATION);
445 }
446 }
447#endif
448
449 // if soln is not setup then setup the distribution
450 if (!soln.built())
451 {
453 x.distribution_pt()->communicator_pt(), this->ncol(), false);
454 soln.build(dist_pt, 0.0);
455 delete dist_pt;
456 }
457
458 // Initialise the solution
459 soln.initialise(0.0);
460
461 // Matrix vector product
462 double* soln_pt = soln.values_pt();
463 const double* x_pt = x.values_pt();
464 for (unsigned long i = 0; i < N; i++)
465 {
466 for (unsigned long j = 0; j < M; j++)
467 {
468 soln_pt[j] += Matrixdata[N * i + j] * x_pt[i];
469 }
470 }
471 }
472
473
474 //=================================================================
475 /// For every row, find the maximum absolute value of the
476 /// entries in this row. Set all values that are less than alpha times
477 /// this maximum to zero and return the resulting matrix in
478 /// reduced_matrix. Note: Diagonal entries are retained regardless
479 /// of their size.
480 //=================================================================
481 void DenseDoubleMatrix::matrix_reduction(const double& alpha,
482 DenseDoubleMatrix& reduced_matrix)
483 {
484 reduced_matrix.resize(N, M, 0.0);
485 // maximum value in a row
486 double max_row;
487
488 // Loop over rows
489 for (unsigned i = 0; i < N; i++)
490 {
491 // Initialise max value in row
492 max_row = 0.0;
493
494 // Loop over entries in columns
495 for (unsigned long j = 0; j < M; j++)
496 {
497 // Find max. value in row
498 if (std::fabs(Matrixdata[M * i + j]) > max_row)
499 {
500 max_row = std::fabs(Matrixdata[M * i + j]);
501 }
502 }
503
504 // Decide if we need to retain the entries in the row
505 for (unsigned long j = 0; j < M; j++)
506 {
507 // If we're on the diagonal or the value is sufficiently large: retain
508 // i.e. copy across.
509 if (i == j || std::fabs(Matrixdata[M * i + j]) > alpha * max_row)
510 {
511 reduced_matrix(i, j) = Matrixdata[M * i + j];
512 }
513 }
514 }
515 }
516
517
518 //=============================================================================
519 /// Function to multiply this matrix by the DenseDoubleMatrix matrix_in.
520 //=============================================================================
522 DenseDoubleMatrix& result)
523 {
524#ifdef PARANOID
525 // check matrix dimensions are compatable
526 if (this->ncol() != matrix_in.nrow())
527 {
528 std::ostringstream error_message;
529 error_message
530 << "Matrix dimensions incompatable for matrix-matrix multiplication"
531 << "ncol() for first matrix:" << this->ncol()
532 << "nrow() for second matrix: " << matrix_in.nrow();
533
534 throw OomphLibError(
535 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
536 }
537#endif
538
539 // NB N is number of rows!
540 unsigned long n_row = this->nrow();
541 unsigned long m_col = matrix_in.ncol();
542
543 // resize and intialize result
544 result.resize(n_row, m_col, 0.0);
545
546 // clock_t clock1 = clock();
547
548 // do calculation
549 unsigned long n_col = this->ncol();
550 for (unsigned long k = 0; k < n_col; k++)
551 {
552 for (unsigned long i = 0; i < n_row; i++)
553 {
554 for (unsigned long j = 0; j < m_col; j++)
555 {
556 result(i, j) += Matrixdata[m_col * i + k] * matrix_in(k, j);
557 }
558 }
559 }
560 }
561
562
563 /// ////////////////////////////////////////////////////////////////////////////
564 /// ////////////////////////////////////////////////////////////////////////////
565 /// ////////////////////////////////////////////////////////////////////////////
566
567
568 //=======================================================================
569 /// Default constructor, set the default linear solver and
570 /// matrix-matrix multiplication method.
571 //========================================================================
573 {
576 }
577
578 //========================================================================
579 /// Constructor: Pass vector of values, vector of row indices,
580 /// vector of column starts and number of rows (can be suppressed
581 /// for square matrices). Number of nonzero entries is read
582 /// off from value, so make sure the vector has been shrunk
583 /// to its correct length.
584 //=======================================================================
586 const Vector<int>& row_index,
587 const Vector<int>& column_start,
588 const unsigned long& n,
589 const unsigned long& m)
590 : CCMatrix<double>(value, row_index, column_start, n, m)
591 {
594 }
595
596 /// Destructor: delete the default linear solver
598 {
600 }
601
602
603 //===================================================================
604 /// Perform LU decomposition. Return the sign of the determinant
605 //===================================================================
607 {
608 static_cast<SuperLUSolver*>(Default_linear_solver_pt)->factorise(this);
609 }
610
611 //===================================================================
612 /// Do the backsubstitution
613 //===================================================================
615 {
616 static_cast<SuperLUSolver*>(Default_linear_solver_pt)->backsub(rhs, rhs);
617 }
618
619 //===================================================================
620 /// Multiply the matrix by the vector x
621 //===================================================================
623 {
624#ifdef PARANOID
625 // Check to see if x.size() = ncol().
626 if (x.nrow() != this->ncol())
627 {
628 std::ostringstream error_message_stream;
629 error_message_stream << "The x vector is not the right size. It is "
630 << x.nrow() << ", it should be " << this->ncol()
631 << std::endl;
632 throw OomphLibError(error_message_stream.str(),
633 OOMPH_CURRENT_FUNCTION,
634 OOMPH_EXCEPTION_LOCATION);
635 }
636 // check that x is not distributed
637 if (x.distributed())
638 {
639 std::ostringstream error_message_stream;
640 error_message_stream
641 << "The x vector cannot be distributed for CCDoubleMatrix "
642 << "matrix-vector multiply" << std::endl;
643 throw OomphLibError(error_message_stream.str(),
644 OOMPH_CURRENT_FUNCTION,
645 OOMPH_EXCEPTION_LOCATION);
646 }
647 // if soln is setup...
648 if (soln.built())
649 {
650 // check that soln is not distributed
651 if (soln.distributed())
652 {
653 std::ostringstream error_message_stream;
654 error_message_stream
655 << "The x vector cannot be distributed for CCDoubleMatrix "
656 << "matrix-vector multiply" << std::endl;
657 throw OomphLibError(error_message_stream.str(),
658 OOMPH_CURRENT_FUNCTION,
659 OOMPH_EXCEPTION_LOCATION);
660 }
661 if (soln.nrow() != this->nrow())
662 {
663 std::ostringstream error_message_stream;
664 error_message_stream
665 << "The soln vector is setup and therefore must have the same "
666 << "number of rows as the matrix";
667 throw OomphLibError(error_message_stream.str(),
668 OOMPH_CURRENT_FUNCTION,
669 OOMPH_EXCEPTION_LOCATION);
670 }
671 if (*soln.distribution_pt()->communicator_pt() !=
673 {
674 std::ostringstream error_message_stream;
675 error_message_stream
676 << "The soln vector and the x vector must have the same communicator"
677 << std::endl;
678 throw OomphLibError(error_message_stream.str(),
679 OOMPH_CURRENT_FUNCTION,
680 OOMPH_EXCEPTION_LOCATION);
681 }
682 }
683#endif
684
685 // if soln is not setup then setup the distribution
686 if (!soln.built())
687 {
689 x.distribution_pt()->communicator_pt(), this->nrow(), false);
690 soln.build(dist_pt, 0.0);
691 delete dist_pt;
692 }
693
694 // zero
695 soln.initialise(0.0);
696
697 // multiply
698 double* soln_pt = soln.values_pt();
699 const double* x_pt = x.values_pt();
700 for (unsigned long j = 0; j < N; j++)
701 {
702 for (long k = Column_start[j]; k < Column_start[j + 1]; k++)
703 {
704 unsigned long i = Row_index[k];
705 double a_ij = Value[k];
706 soln_pt[i] += a_ij * x_pt[j];
707 }
708 }
709 }
710
711
712 //=================================================================
713 /// Multiply the transposed matrix by the vector x: soln=A^T x
714 //=================================================================
716 DoubleVector& soln) const
717 {
718#ifdef PARANOID
719 // Check to see if x.size() = ncol().
720 if (x.nrow() != this->nrow())
721 {
722 std::ostringstream error_message_stream;
723 error_message_stream << "The x vector is not the right size. It is "
724 << x.nrow() << ", it should be " << this->nrow()
725 << std::endl;
726 throw OomphLibError(error_message_stream.str(),
727 OOMPH_CURRENT_FUNCTION,
728 OOMPH_EXCEPTION_LOCATION);
729 }
730 // check that x is not distributed
731 if (x.distributed())
732 {
733 std::ostringstream error_message_stream;
734 error_message_stream
735 << "The x vector cannot be distributed for CCDoubleMatrix "
736 << "matrix-vector multiply" << std::endl;
737 throw OomphLibError(error_message_stream.str(),
738 OOMPH_CURRENT_FUNCTION,
739 OOMPH_EXCEPTION_LOCATION);
740 }
741 // if soln is setup...
742 if (soln.built())
743 {
744 // check that soln is not distributed
745 if (soln.distributed())
746 {
747 std::ostringstream error_message_stream;
748 error_message_stream
749 << "The x vector cannot be distributed for CCDoubleMatrix "
750 << "matrix-vector multiply" << std::endl;
751 throw OomphLibError(error_message_stream.str(),
752 OOMPH_CURRENT_FUNCTION,
753 OOMPH_EXCEPTION_LOCATION);
754 }
755 if (soln.nrow() != this->ncol())
756 {
757 std::ostringstream error_message_stream;
758 error_message_stream
759 << "The soln vector is setup and therefore must have the same "
760 << "number of columns as the matrix";
761 throw OomphLibError(error_message_stream.str(),
762 OOMPH_CURRENT_FUNCTION,
763 OOMPH_EXCEPTION_LOCATION);
764 }
765 if (*soln.distribution_pt()->communicator_pt() !=
767 {
768 std::ostringstream error_message_stream;
769 error_message_stream
770 << "The soln vector and the x vector must have the same communicator"
771 << std::endl;
772 throw OomphLibError(error_message_stream.str(),
773 OOMPH_CURRENT_FUNCTION,
774 OOMPH_EXCEPTION_LOCATION);
775 }
776 }
777#endif
778
779 // if soln is not setup then setup the distribution
780 if (!soln.built())
781 {
783 x.distribution_pt()->communicator_pt(), this->ncol(), false);
784 soln.build(dist_pt, 0.0);
785 delete dist_pt;
786 }
787
788 // zero
789 soln.initialise(0.0);
790
791 // Matrix vector product
792 double* soln_pt = soln.values_pt();
793 const double* x_pt = x.values_pt();
794 for (unsigned long i = 0; i < N; i++)
795 {
796 for (long k = Column_start[i]; k < Column_start[i + 1]; k++)
797 {
798 unsigned long j = Row_index[k];
799 double a_ij = Value[k];
800 soln_pt[j] += a_ij * x_pt[i];
801 }
802 }
803 }
804
805
806 //===========================================================================
807 /// Function to multiply this matrix by the CCDoubleMatrix matrix_in
808 /// The multiplication method used can be selected using the flag
809 /// Matrix_matrix_multiply_method. By default Method 2 is used.
810 /// Method 1: First runs through this matrix and matrix_in to find the storage
811 /// requirements for result - arrays of the correct size are
812 /// then allocated before performing the calculation.
813 /// Minimises memory requirements but more costly.
814 /// Method 2: Grows storage for values and column indices of result 'on the
815 /// fly' using an array of maps. Faster but more memory
816 /// intensive.
817 /// Method 3: Grows storage for values and column indices of result 'on the
818 /// fly' using a vector of vectors. Not particularly impressive
819 /// on the platforms we tried...
820 //=============================================================================
822 CCDoubleMatrix& result)
823 {
824#ifdef PARANOID
825 // check matrix dimensions are compatible
826 if (this->ncol() != matrix_in.nrow())
827 {
828 std::ostringstream error_message;
829 error_message
830 << "Matrix dimensions incompatable for matrix-matrix multiplication"
831 << "ncol() for first matrix:" << this->ncol()
832 << "nrow() for second matrix: " << matrix_in.nrow();
833
834 throw OomphLibError(
835 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
836 }
837#endif
838
839 // NB N is number of rows!
840 unsigned long N = this->nrow();
841 unsigned long M = matrix_in.ncol();
842 unsigned long Nnz = 0;
843
844 // pointers to arrays which store result
845 int* Column_start;
846 double* Value;
847 int* Row_index;
848
849 // get pointers to matrix_in
850 const int* matrix_in_col_start = matrix_in.column_start();
851 const int* matrix_in_row_index = matrix_in.row_index();
852 const double* matrix_in_value = matrix_in.value();
853
854 // get pointers to this matrix
855 const double* this_value = this->value();
856 const int* this_col_start = this->column_start();
857 const int* this_row_index = this->row_index();
858
859 // set method
860 unsigned method = Matrix_matrix_multiply_method;
861
862 // clock_t clock1 = clock();
863
864 // METHOD 1
865 // --------
866 if (method == 1)
867 {
868 // allocate storage for column starts
869 Column_start = new int[M + 1];
870 Column_start[0] = 0;
871
872 // a set to store number of non-zero rows in each column of result
873 std::set<unsigned> rows;
874
875 // run through columns of this matrix and matrix_in to find number of
876 // non-zero entries in each column of result
877 for (unsigned long this_col = 0; this_col < M; this_col++)
878 {
879 // run through non-zeros in this_col of this matrix
880 for (int this_ptr = this_col_start[this_col];
881 this_ptr < this_col_start[this_col + 1];
882 this_ptr++)
883 {
884 // find row index for non-zero
885 unsigned matrix_in_col = this_row_index[this_ptr];
886
887 // run through corresponding column in matrix_in
888 for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
889 matrix_in_ptr < matrix_in_col_start[matrix_in_col + 1];
890 matrix_in_ptr++)
891 {
892 // find row index for non-zero in matrix_in and store in rows
893 rows.insert(matrix_in_row_index[matrix_in_ptr]);
894 }
895 }
896 // update Column_start
897 Column_start[this_col + 1] = Column_start[this_col] + rows.size();
898
899 // wipe values in rows
900 rows.clear();
901 }
902
903 // set Nnz
904 Nnz = Column_start[M];
905
906 // allocate arrays for result
907 Value = new double[Nnz];
908 Row_index = new int[Nnz];
909
910 // set all values of Row_index to -1
911 for (unsigned long i = 0; i < Nnz; i++) Row_index[i] = -1;
912
913 // Calculate values for result - first run through columns of this matrix
914 for (unsigned long this_col = 0; this_col < M; this_col++)
915 {
916 // run through non-zeros in this_column
917 for (int this_ptr = this_col_start[this_col];
918 this_ptr < this_col_start[this_col + 1];
919 this_ptr++)
920 {
921 // find value of non-zero
922 double this_val = this_value[this_ptr];
923
924 // find row associated with non-zero
925 unsigned matrix_in_col = this_row_index[this_ptr];
926
927 // run through corresponding column in matrix_in
928 for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
929 matrix_in_ptr < matrix_in_col_start[matrix_in_col + 1];
930 matrix_in_ptr++)
931 {
932 // find row index for non-zero in matrix_in
933 int row = matrix_in_row_index[matrix_in_ptr];
934
935 // find position in result to insert value
936 for (int ptr = Column_start[this_col];
937 ptr <= Column_start[this_col + 1];
938 ptr++)
939 {
940 if (ptr == Column_start[this_col + 1])
941 {
942 // error - have passed end of column without finding
943 // correct row index
944 std::ostringstream error_message;
945 error_message << "Error inserting value in result";
946
947 throw OomphLibError(error_message.str(),
948 OOMPH_CURRENT_FUNCTION,
949 OOMPH_EXCEPTION_LOCATION);
950 }
951 else if (Row_index[ptr] == -1)
952 {
953 // first entry for this row index
954 Row_index[ptr] = row;
955 Value[ptr] = this_val * matrix_in_value[matrix_in_ptr];
956 break;
957 }
958 else if (Row_index[ptr] == row)
959 {
960 // row index already exists - add value
961 Value[ptr] += this_val * matrix_in_value[matrix_in_ptr];
962 break;
963 }
964 }
965 }
966 }
967 }
968 }
969
970 // METHOD 2
971 // --------
972 else if (method == 2)
973 {
974 // generate array of maps to store values for result
975 std::map<int, double>* result_maps = new std::map<int, double>[M];
976
977 // run through columns of this matrix
978 for (unsigned long this_col = 0; this_col < M; this_col++)
979 {
980 // run through non-zeros in this_col
981 for (int this_ptr = this_col_start[this_col];
982 this_ptr < this_col_start[this_col + 1];
983 this_ptr++)
984 {
985 // find value of non-zero
986 double this_val = this_value[this_ptr];
987
988 // find row index associated with non-zero
989 unsigned matrix_in_col = this_row_index[this_ptr];
990
991 // run through corresponding column in matrix_in
992 for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
993 matrix_in_ptr < matrix_in_col_start[matrix_in_col + 1];
994 matrix_in_ptr++)
995 {
996 // find row index for non-zero in matrix_in
997 int row = matrix_in_row_index[matrix_in_ptr];
998
999 // insert value
1000 result_maps[this_col][row] +=
1001 this_val * matrix_in_value[matrix_in_ptr];
1002 }
1003 }
1004 }
1005
1006 // allocate Column_start
1007 Column_start = new int[M + 1];
1008
1009 // copy across column starts
1010 Column_start[0] = 0;
1011 for (unsigned long col = 0; col < M; col++)
1012 {
1013 int size = result_maps[col].size();
1014 Column_start[col + 1] = Column_start[col] + size;
1015 }
1016
1017 // set Nnz
1018 Nnz = Column_start[M];
1019
1020 // allocate other arrays
1021 Value = new double[Nnz];
1022 Row_index = new int[Nnz];
1023
1024 // copy values and row indices
1025 for (unsigned long col = 0; col < M; col++)
1026 {
1027 unsigned ptr = Column_start[col];
1028 for (std::map<int, double>::iterator i = result_maps[col].begin();
1029 i != result_maps[col].end();
1030 i++)
1031 {
1032 Row_index[ptr] = i->first;
1033 Value[ptr] = i->second;
1034 ptr++;
1035 }
1036 }
1037
1038 // tidy up memory
1039 delete[] result_maps;
1040 }
1041
1042 // METHOD 3
1043 // --------
1044 else if (method == 3)
1045 {
1046 // vectors of vectors to store results
1047 std::vector<std::vector<int>> result_rows(N);
1048 std::vector<std::vector<double>> result_vals(N);
1049
1050 // run through the columns of this matrix
1051 for (unsigned long this_col = 0; this_col < M; this_col++)
1052 {
1053 // run through non-zeros in this_col
1054 for (int this_ptr = this_col_start[this_col];
1055 this_ptr < this_col_start[this_col + 1];
1056 this_ptr++)
1057 {
1058 // find value of non-zero
1059 double this_val = this_value[this_ptr];
1060
1061 // find row index associated with non-zero
1062 unsigned matrix_in_col = this_row_index[this_ptr];
1063
1064 // run through corresponding column in matrix_in
1065 for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
1066 matrix_in_ptr < matrix_in_col_start[matrix_in_col + 1];
1067 matrix_in_ptr++)
1068 {
1069 // find row index for non-zero in matrix_in
1070 int row = matrix_in_row_index[matrix_in_ptr];
1071
1072 // insert value
1073 int size = result_rows[this_col].size();
1074 for (int i = 0; i <= size; i++)
1075 {
1076 if (i == size)
1077 {
1078 // first entry for this row index
1079 result_rows[this_col].push_back(row);
1080 result_vals[this_col].push_back(this_val *
1081 matrix_in_value[matrix_in_ptr]);
1082 }
1083 else if (row == result_rows[this_col][i])
1084 {
1085 // row index already exists
1086 result_vals[this_col][i] +=
1087 this_val * matrix_in_value[matrix_in_ptr];
1088 break;
1089 }
1090 }
1091 }
1092 }
1093 }
1094
1095 // allocate Column_start
1096 Column_start = new int[M + 1];
1097
1098 // copy across column starts
1099 Column_start[0] = 0;
1100 for (unsigned long col = 0; col < M; col++)
1101 {
1102 int size = result_rows[col].size();
1103 Column_start[col + 1] = Column_start[col] + size;
1104 }
1105
1106 // set Nnz
1107 Nnz = Column_start[M];
1108
1109 // allocate other arrays
1110 Value = new double[Nnz];
1111 Row_index = new int[Nnz];
1112
1113 // copy across values and row indices
1114 for (unsigned long col = 0; col < N; col++)
1115 {
1116 unsigned ptr = Column_start[col];
1117 unsigned n_rows = result_rows[col].size();
1118 for (unsigned i = 0; i < n_rows; i++)
1119 {
1120 Row_index[ptr] = result_rows[col][i];
1121 Value[ptr] = result_vals[col][i];
1122 ptr++;
1123 }
1124 }
1125 }
1126
1127 // INCORRECT VALUE FOR METHOD
1128 else
1129 {
1130 std::ostringstream error_message;
1131 error_message << "Incorrect method set in matrix-matrix multiply"
1132 << "method=" << method << " not allowed";
1133
1134 throw OomphLibError(
1135 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
1136 }
1137
1139 }
1140
1141
1142 //=================================================================
1143 /// For every row, find the maximum absolute value of the
1144 /// entries in this row. Set all values that are less than alpha times
1145 /// this maximum to zero and return the resulting matrix in
1146 /// reduced_matrix. Note: Diagonal entries are retained regardless
1147 /// of their size.
1148 //=================================================================
1149 void CCDoubleMatrix::matrix_reduction(const double& alpha,
1150 CCDoubleMatrix& reduced_matrix)
1151 {
1152 // number of columns in matrix
1153 long n_coln = ncol();
1154
1155 Vector<double> max_row(nrow(), 0.0);
1156
1157 // Here's the packed format for the new matrix
1158 Vector<int> B_row_start(1);
1159 Vector<int> B_column_index;
1160 Vector<double> B_value;
1161
1162
1163 // k is counter for the number of entries in the reduced matrix
1164 unsigned k = 0;
1165
1166 // Initialise row start
1167 B_row_start[0] = 0;
1168
1169 // Loop over columns
1170 for (long i = 0; i < n_coln; i++)
1171 {
1172 // Loop over entries in columns
1173 for (long j = Column_start[i]; j < Column_start[i + 1]; j++)
1174 {
1175 // Find max. value in row
1176 if (std::fabs(Value[j]) > max_row[Row_index[j]])
1177 {
1178 max_row[Row_index[j]] = std::fabs(Value[j]);
1179 }
1180 }
1181
1182 // Decide if we need to retain the entries in the row
1183 for (long j = Column_start[i]; j < Column_start[i + 1]; j++)
1184 {
1185 // If we're on the diagonal or the value is sufficiently large: retain
1186 // i.e. copy across.
1187 if (i == Row_index[j] ||
1188 std::fabs(Value[j]) > alpha * max_row[Row_index[j]])
1189 {
1190 B_value.push_back(Value[j]);
1191 B_column_index.push_back(Row_index[j]);
1192 k++;
1193 }
1194 }
1195 // This writes the row start for the next row -- equal to
1196 // to the number of entries written so far (C++ zero-based indexing!)
1197 B_row_start.push_back(k);
1198 }
1199
1200
1201 // Build the matrix from the compressed format
1202 dynamic_cast<CCDoubleMatrix&>(reduced_matrix)
1203 .build(B_value, B_column_index, B_row_start, nrow(), ncol());
1204 }
1205
1206
1207 /// ////////////////////////////////////////////////////////////////////////////
1208 /// ////////////////////////////////////////////////////////////////////////////
1209 /// ////////////////////////////////////////////////////////////////////////////
1210
  //=============================================================================
  /// Default constructor
  //=============================================================================
  // Creates an empty, unbuilt matrix.
  // NOTE(review): the constructor signature line and the default-solver /
  // multiply-method assignment lines are elided in this extraction --
  // confirm against the full source.
  {
    // set the default solver
    // (solver assignment elided in this extraction)

    // matrix not built
    Built = false;

    // set the serial matrix-matrix multiply method
    // (assignments in both branches elided in this extraction)
#ifdef OOMPH_HAS_TRILINOS
    // Serial_matrix_matrix_multiply_method = 4;
#else
#endif
  }
1230
  //=============================================================================
  /// Copy constructor
  //=============================================================================
  // Performs a DEEP copy of other_matrix: the value, column-index and
  // row-start arrays are duplicated and handed to build_without_copy(),
  // so the new matrix owns its own storage.
  // NOTE(review): the constructor signature line is elided in this
  // extraction.
  {
    // copy the distribution
    this->build_distribution(other_matrix.distribution_pt());

    // copy coefficients
    const double* values_pt = other_matrix.value();
    const int* column_indices = other_matrix.column_index();
    const int* row_start = other_matrix.row_start();

    // This is the local nnz.
    const unsigned nnz = other_matrix.nnz();

    // Using number of local rows since the underlying CRMatrix is local to
    // each processor.
    const unsigned nrow_local = other_matrix.nrow_local();

    // Storage for the (yet to be copied) data.
    double* my_values_pt = new double[nnz];
    int* my_column_indices = new int[nnz];
    int* my_row_start = new int[nrow_local + 1];

    // Copying over the data.
    std::copy(values_pt, values_pt + nnz, my_values_pt);
    std::copy(column_indices, column_indices + nnz, my_column_indices);
    std::copy(row_start, row_start + nrow_local + 1, my_row_start);


    // Build without copy since we have made a deep copy of the data structure.
    // (build_without_copy takes ownership of the arrays allocated above)
    this->build_without_copy(
      other_matrix.ncol(), nnz, my_values_pt, my_column_indices, my_row_start);

    // set the default solver
    // (solver assignment elided in this extraction)

    // matrix is built
    Built = true;

    // set the serial matrix-matrix multiply method
    // (assignments in both branches elided in this extraction)
#ifdef OOMPH_HAS_TRILINOS
    // Serial_matrix_matrix_multiply_method = 4;
#else
#endif
  }
1280
1281
  //=============================================================================
  /// Constructor: just stores the distribution but does not build the
  /// matrix
  //=============================================================================
  // NOTE(review): the signature line (presumably
  // CRDoubleMatrix::CRDoubleMatrix(...) taking the pointer below) is
  // elided in this extraction.
    const LinearAlgebraDistribution* distribution_pt)
  {
    // store the distribution; no coefficient storage is allocated here
    this->build_distribution(distribution_pt);

    // set the default solver
    // (solver assignment elided in this extraction)

    // matrix not built
    Built = false;

// set the serial matrix-matrix multiply method
// (assignments in both branches elided in this extraction)
#ifdef OOMPH_HAS_TRILINOS
    // Serial_matrix_matrix_multiply_method = 4;
#else
#endif
  }
1305
  //=============================================================================
  /// Constructor: Takes the distribution and the number of columns, as
  /// well as the vector of values, vector of column indices,vector of row
  /// starts.
  //=============================================================================
  // NOTE(review): the first signature line (presumably taking a
  // const LinearAlgebraDistribution* dist_pt) is elided in this extraction.
    const unsigned& ncol,
    const Vector<double>& value,
    const Vector<int>& column_index,
    const Vector<int>& row_start)
  {
    // build the compressed row matrix
    // (number of local rows comes from the supplied distribution)
    CR_matrix.build(
      value, column_index, row_start, dist_pt->nrow_local(), ncol);

    // store the Distribution
    this->build_distribution(dist_pt);

    // set the linear solver
    // (solver assignment elided in this extraction)

    // set the serial matrix-matrix multiply method
    // (assignments in both branches elided in this extraction)
#ifdef OOMPH_HAS_TRILINOS
    // Serial_matrix_matrix_multiply_method = 4;
#else
#endif

    // matrix has been built
    Built = true;
  }
1338
1339
  //=============================================================================
  /// Destructor
  //=============================================================================
  // Frees the matrix storage via clear().
  // NOTE(review): the signature line and the solver clean-up lines are
  // elided in this extraction.
  {
    this->clear();
  }
1349
  //=============================================================================
  /// Rebuild the matrix - assembles an empty matrix with a defined distribution
  //=============================================================================
  // NOTE(review): the signature line (presumably
  // void CRDoubleMatrix::build(const LinearAlgebraDistribution*)) is
  // elided in this extraction.
  {
    // wipe any existing storage, then store the new distribution;
    // no coefficients are assembled
    this->clear();
    this->build_distribution(distribution_pt);
  }
1358
1359 //=============================================================================
1360 /// Runs through the column index vector and checks if the entries
1361 /// are arranged arbitrarily or if they follow the regular lexicographical
1362 /// of matrices. If a boolean argument is provided with the assignment
1363 /// TRUE then information on the first entry which is not in the correct
1364 /// position will also be given
1365 //=============================================================================
1367 const bool& doc_unordered_entries) const
1368 {
1369#ifdef OOMPH_HAS_MPI
1370 // We only need to produce a warning if the matrix is distributed
1371 if (this->distributed())
1372 {
1373 // Create an ostringstream object to store the warning message
1374 std::ostringstream warning_message;
1375
1376 // Create the warning messsage
1377 warning_message << "This method currently works for serial but "
1378 << "has not been implemented for use with MPI!\n";
1379
1380 // Issue the warning
1381 OomphLibWarning(warning_message.str(),
1382 OOMPH_CURRENT_FUNCTION,
1383 OOMPH_EXCEPTION_LOCATION);
1384 }
1385#endif
1386
1387 // Get the number of rows in this matrix
1388 unsigned n_rows = this->nrow();
1389
1390 // Acquire access to the value, row_start and column_index arrays from
1391 // the CR matrix. Since we do not change anything in row_start_pt we
1392 // give it the const prefix
1393 const int* column_index_pt = this->column_index();
1394 const int* row_start_pt = this->row_start();
1395
1396 // Loop over the rows of matrix
1397 for (unsigned i = 0; i < n_rows; i++)
1398 {
1399 // Calculate the number of nonzeros in the i-th row
1400 unsigned nnz_row_i = *(row_start_pt + i + 1) - *(row_start_pt + i);
1401
1402 // Get the index of the first entry in row i
1403 unsigned i_row_start = *(row_start_pt + i);
1404
1405 // Loop over the entries of the i-th row
1406 for (unsigned j = 0; j < nnz_row_i - 1; j++)
1407 {
1408 // Check if the column index of the following entry is greater than the
1409 // current entry
1410 if ((*(column_index_pt + i_row_start + j + 1)) <
1411 (*(column_index_pt + i_row_start + j)))
1412 {
1413 // If the input argument was set to TRUE we document we output
1414 // information of the first entry which is not in the correct position
1415 if (doc_unordered_entries)
1416 {
1417 // Tell the user
1418 oomph_info << "Matrix has not been correctly sorted!"
1419 << "\nOn row: " << i << "\nEntry: " << j
1420 << "\nEntry 1 = " << *(column_index_pt + i_row_start + j)
1421 << "\nEntry 2 = "
1422 << *(column_index_pt + i_row_start + j + 1) << std::endl;
1423 }
1424
1425 // It hasn't worked
1426 return false;
1427 } // if ((*(column_index_pt+i_row_start+j+1)) ...
1428 } // for (unsigned j=0;j<nnz_row_i-1;j++)
1429 } // for (unsigned i=0;i<n_rows;i++)
1430
1431 // If it gets here without a warning then the entries in each row of
1432 // the matrix are ordered by increasing column index
1433 return true;
1434 } // End of entries_are_sorted()
1435
  //=============================================================================
  /// This helper function sorts the entries in the column index vector
  /// and the value vector. During the construction of the matrix the entries
  /// were most likely assigned in an arbitrary order. As a result, it cannot
  /// be assumed that the entries in the column index vector corresponding to
  /// each row of the matrix have been arranged in increasing order. During
  /// the setup an additional vector will be set up; Index_of_diagonal_entries.
  /// The i-th entry of this vector contains the index of the last entry
  /// below or on the diagonal. If there are no entries below or on the
  /// diagonal then the corresponding entry is -1. If, however, there are
  /// no entries in the row then the entry is irrelevant and is kept
  /// as the initialised value; 0.
  //=============================================================================
  // NOTE(review): the function signature line (presumably
  // void CRDoubleMatrix::sort_entries()) and a few statement lines are
  // elided in this extraction; hedged notes mark the gaps below.
  {
#ifdef OOMPH_HAS_MPI
    // We only need to produce a warning if the matrix is distributed
    if (this->distributed())
    {
      // Create an ostringstream object to store the warning message
      std::ostringstream warning_message;

      // Create the warning message
      warning_message << "This method currently works for serial but "
                      << "has not been tested with MPI!\n";

      // Issue the warning
      OomphLibWarning(warning_message.str(),
                      OOMPH_CURRENT_FUNCTION,
                      OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // Get the number of rows in the matrix
    unsigned n_rows = this->nrow();

    // Acquire access to the value, row_start and column_index arrays from
    // the CR matrix. Since we do not change anything in row_start_pt we
    // give it the const prefix
    double* value_pt = this->value();
    int* column_index_pt = this->column_index();
    const int* row_start_pt = this->row_start();

    // Resize the Index_of_diagonal_entries vector
    Index_of_diagonal_entries.resize(n_rows, 0);

    // Vector of pairs to store the column_index of each value in the i-th row
    // and its corresponding matrix entry
    Vector<std::pair<int, double>> column_index_and_value_row_i;

    // Loop over the rows of the matrix
    for (unsigned i = 0; i < n_rows; i++)
    {
      // Find the number of nonzeros in the i-th row
      unsigned nnz_row_i = *(row_start_pt + i + 1) - *(row_start_pt + i);

      // Variable to store the start of the i-th row
      unsigned i_row_start = *(row_start_pt + i);

      // If there are no nonzeros in this row then the i-th entry of the vector
      // Index_of_diagonal_entries is irrelevant so we can simply let it be 0
      if (nnz_row_i == 0)
      {
        // Set the i-th entry
        // (assignment elided in this extraction; presumably
        // Index_of_diagonal_entries[i] = 0 -- confirm against full source)
      }
      // If there are nonzeros in the i-th row
      else
      {
        // If there is more than one entry in the row resize the vector
        // column_index_and_value_row_i
        column_index_and_value_row_i.resize(nnz_row_i);

        // Loop over the entries in the row
        for (unsigned j = 0; j < nnz_row_i; j++)
        {
          // Assign the appropriate entries to column_index_and_value_row_i
          column_index_and_value_row_i[j] =
            std::make_pair(*(column_index_pt + i_row_start + j),
                           *(value_pt + i_row_start + j));
        }

        // Sort the vector of pairs using the struct
        // CRDoubleMatrixComparisonHelper
        // (the comparison-object argument and the closing parenthesis of
        // this call are elided in this extraction)
        std::sort(column_index_and_value_row_i.begin(),
                  column_index_and_value_row_i.end(),

        //-----------------------------------------------------------------------
        // Now that the entries of the i-th row have been sorted we can read
        // them back into value_pt and column_index_pt:
        //-----------------------------------------------------------------------

        // Create a boolean variable to indicate whether or not the i-th entry
        // of Index_of_diagonal_entries has been set
        bool is_ith_entry_set = false;

        // Loop over the entries in the vector column_index_and_value_row_i and
        // assign its entries to value_pt and column_index_pt
        for (unsigned j = 0; j < nnz_row_i; j++)
        {
          // Set the column index of the j-th nonzero value in the i-th row of
          // the matrix
          *(column_index_pt + i_row_start + j) =
            column_index_and_value_row_i[j].first;

          // Set the value of the j-th nonzero value in the i-th row of
          // the matrix
          *(value_pt + i_row_start + j) =
            column_index_and_value_row_i[j].second;

          // This if statement is used to set the i-th entry of the vector
          // Index_of_diagonal_entries if it has not yet been set
          if (!is_ith_entry_set)
          {
            // If the column index of the first entry in row i is greater than
            // the row number then the first entry must lie above the diagonal
            if (unsigned(*(column_index_pt + i_row_start)) > i)
            {
              // If the column index of the first entry in the row is greater
              // than the row number, i, then the i-th entry of
              // Index_of_diagonal_entries needs to be set to -1 to indicate
              // there are no entries below or on the diagonal
              // (assignment elided in this extraction; presumably
              // Index_of_diagonal_entries[i] = -1 -- confirm)

              // Indicate that the i-th entry of Index_of_diagonal_entries has
              // been set
              is_ith_entry_set = true;
            }
            // If there are entries below or on the diagonal
            else
            {
              // If there is only one entry in the row then we know that this
              // will be the last entry below or on the diagonal because we have
              // eliminated the possibility that if there is only one entry,
              // that it lies above the diagonal
              if (nnz_row_i == 1)
              {
                // Set the index of the current entry to be the value of i-th
                // entry of Index_of_diagonal_entries
                Index_of_diagonal_entries[i] = i_row_start + j;

                // Indicate that the i-th entry of Index_of_diagonal_entries has
                // been set
                is_ith_entry_set = true;
              }
              // It remains to now check the case that there is more than one
              // entry in the row. If there is more than one entry in the row
              // and there are entries below or on the diagonal then we have
              // three cases:
              // (1) The current entry lies on the diagonal;
              // (2) The current entry lies above the diagonal;
              // (3) The current entry lies below the diagonal;
              // The first case can easily be checked as done below. If the
              // second case occurs then we have just passed the last entry. We
              // know this because at least one entry lies on or below the
              // diagonal. If the second case it true then we need to assign the
              // previous entry to the vector Index_of_diagonal_entries.
              // Finally, we are left with case (3), which can be split into two
              // cases:
              // (3.1) The current entry lies below the diagonal but it
              // is not the last entry below or on the diagonal;
              // (3.2) The current entry lies below the diagonal and is
              // the last entry below or on the diagonal.
              // If case (3.1) holds then we can simply wait until we get to the
              // next entry in the row and examine that. If the next entry lies
              // on the diagonal then it will be swept up by case (1). If the
              // next entry lies above the diagonal then case (2) will sweep it
              // up and if neither is the case then we wait until the next entry
              // and so on. If, instead, case (3.2) holds then our last check
              // simply needs to check if the current entry is the last entry in
              // the row because if the last entry lies on the diagonal, case
              // (1) will sweep it up. If it lies above the diagonal, case (2)
              // will take care of it. Therefore, the only remaining case is
              // that it lies strictly below the diagonal and since it is the
              // last entry in the row it means the index of this entry needs to
              // be assigned to Index_of_diagonal_entries

              // Case (1) : The current entry lies on the diagonal
              else if (unsigned(*(column_index_pt + i_row_start + j)) == i)
              {
                // Set the index of the current entry to be the value of i-th
                // entry of Index_of_diagonal_entries
                Index_of_diagonal_entries[i] = i_row_start + j;

                // Indicate that the i-th entry of Index_of_diagonal_entries has
                // been set
                is_ith_entry_set = true;
              }
              // Case (2) : The current entry lies above the diagonal
              else if (unsigned(*(column_index_pt + i_row_start + j)) > i)
              {
                // Set the index of the current entry to be the value of i-th
                // entry of Index_of_diagonal_entries
                Index_of_diagonal_entries[i] = i_row_start + j - 1;

                // Indicate that the i-th entry of Index_of_diagonal_entries has
                // been set
                is_ith_entry_set = true;
              }
              // Case (3.2) : The current entry is the last entry in the row
              else if (j == nnz_row_i - 1)
              {
                // Set the index of the current entry to be the value of i-th
                // entry of Index_of_diagonal_entries
                Index_of_diagonal_entries[i] = i_row_start + j;

                // Indicate that the i-th entry of Index_of_diagonal_entries has
                // been set
                is_ith_entry_set = true;
              } // if (nnz_row_i==1) else if
            } // if (*(column_index_pt+i_row_start)>i)
          } // if (!is_ith_entry_set)
        } // for (unsigned j=0;j<nnz_row_i;j++)
      } // if (nnz_row_i==0) else
    } // for (unsigned i=0;i<n_rows;i++)
  } // End of sort_entries()
1653
  //=============================================================================
  /// Clean method
  //=============================================================================
  // Wipes the distribution and the underlying compressed-row storage and
  // flags the matrix as not built.
  // NOTE(review): the signature line and the solver clean-up call inside
  // the if-statement are elided in this extraction.
  {
    this->clear_distribution();
    CR_matrix.clean_up_memory();
    Built = false;

    if (Linear_solver_pt != 0) // Only clean up if it exists
      // (the Linear_solver_pt clean-up call is elided in this extraction)
  }
1666
  //=============================================================================
  /// build method: Takes the distribution and the number of columns, as
  /// well as the vector of values, vector of column indices,vector of row
  /// starts.
  //=============================================================================
  // NOTE(review): the first signature line (presumably taking a
  // const LinearAlgebraDistribution* distribution_pt) is elided in this
  // extraction.
    const unsigned& ncol,
    const Vector<double>& value,
    const Vector<int>& column_index,
    const Vector<int>& row_start)
  {
    // clear
    this->clear();

    // store the Distribution
    this->build_distribution(distribution_pt);

    // set the linear solver
    // (solver assignment elided in this extraction)

    // now build the matrix
    // (delegates to the build() overload that does not set the distribution)
    this->build(ncol, value, column_index, row_start);
  }
1690
1691 //=============================================================================
1692 /// method to rebuild the matrix, but not the distribution
1693 //=============================================================================
1694 void CRDoubleMatrix::build(const unsigned& ncol,
1695 const Vector<double>& value,
1696 const Vector<int>& column_index,
1697 const Vector<int>& row_start)
1698 {
1699 // call the underlying build method
1700 CR_matrix.clean_up_memory();
1701 CR_matrix.build(value, column_index, row_start, this->nrow_local(), ncol);
1702
1703 // matrix has been build
1704 Built = true;
1705 }
1706
1707 //=============================================================================
1708 /// method to rebuild the matrix, but not the distribution
1709 //=============================================================================
1710 void CRDoubleMatrix::build_without_copy(const unsigned& ncol,
1711 const unsigned& nnz,
1712 double* value,
1713 int* column_index,
1714 int* row_start)
1715 {
1716 // call the underlying build method
1717 CR_matrix.clean_up_memory();
1718 CR_matrix.build_without_copy(
1719 value, column_index, row_start, nnz, this->nrow_local(), ncol);
1720
1721 // matrix has been build
1722 Built = true;
1723 }
1724
  //=============================================================================
  /// Do LU decomposition
  //=============================================================================
  // Delegates the factorisation to the default (SuperLU-based) linear solver.
  // NOTE(review): the signature line (presumably
  // void CRDoubleMatrix::ludecompose()) is elided in this extraction.
  {
#ifdef PARANOID
    // check that the this matrix is built
    if (!Built)
    {
      std::ostringstream error_message_stream;
      error_message_stream << "This matrix has not been built.";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // factorise using superlu or superlu dist if we oomph has mpi
    static_cast<SuperLUSolver*>(Default_linear_solver_pt)->factorise(this);
  }
1745
  //=============================================================================
  /// Do back-substitution
  //=============================================================================
  // Solves using the LU factors computed by ludecompose(); rhs is
  // overwritten with the solution.
  // NOTE(review): the signature line (presumably taking DoubleVector& rhs)
  // is elided in this extraction.
  {
#ifdef PARANOID
    // check that the rhs vector is setup
    if (!rhs.built())
    {
      std::ostringstream error_message_stream;
      error_message_stream << "The vector rhs has not been setup";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
    // check that the rhs vector has the same distribution as this matrix
    if (!(*this->distribution_pt() == *rhs.distribution_pt()))
    {
      std::ostringstream error_message_stream;
      error_message_stream
        << "The vector rhs must have the same distribution as the matrix";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
#endif

    // backsub: copy rhs, then solve into rhs itself
    DoubleVector rhs_copy(rhs);
    // (the static_cast<SuperLUSolver*>(Default_linear_solver_pt) line is
    // elided in this extraction)
      ->backsub(rhs_copy, rhs);
  }
1778
  //=============================================================================
  /// Multiply the matrix by the vector x
  //=============================================================================
  // Computes soln = A x. If the matrix is distributed over more than one
  // processor the product is delegated to Trilinos; otherwise a plain
  // serial compressed-row mat-vec loop is used.
  // NOTE(review): the signature line (presumably taking const DoubleVector&
  // x and DoubleVector& soln) is elided in this extraction.
  {
#ifdef PARANOID
    // check that this matrix is built
    if (!Built)
    {
      std::ostringstream error_message_stream;
      error_message_stream << "This matrix has not been built";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
    // check that the distribution of x is setup
    if (!x.built())
    {
      std::ostringstream error_message_stream;
      error_message_stream << "The distribution of the vector x must be setup";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
    // Check to see if x.size() = ncol().
    if (this->ncol() != x.distribution_pt()->nrow())
    {
      std::ostringstream error_message_stream;
      error_message_stream << "The number of rows in the x vector and the "
                              "number of columns in the "
                           << "matrix must be the same";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
    // if the soln is distributed
    if (soln.built())
    {
      if (!(*soln.distribution_pt() == *this->distribution_pt()))
      {
        std::ostringstream error_message_stream;
        error_message_stream
          << "The soln vector is setup and therefore must have the same "
          << "distribution as the matrix";
        throw OomphLibError(error_message_stream.str(),
                            OOMPH_CURRENT_FUNCTION,
                            OOMPH_EXCEPTION_LOCATION);
      }
    }
#endif

    // if soln is not setup then setup the distribution
    if (!soln.built())
    {
      // Resize and initialize the solution vector
      soln.build(this->distribution_pt(), 0.0);
    }

    // Initialise
    soln.initialise(0.0);

    // if distributed and on more than one processor use trilinos
    // otherwise use the oomph-lib methods
    if (this->distributed() &&
        this->distribution_pt()->communicator_pt()->nproc() > 1)
    {
#ifdef OOMPH_HAS_TRILINOS
      // This will only work if we have trilinos on board
      TrilinosEpetraHelpers::multiply(this, x, soln);
#else
      std::ostringstream error_message_stream;
      error_message_stream
        << "Matrix-vector product on multiple processors with distributed "
        << "CRDoubleMatrix requires Trilinos.";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
#endif
    }
    else
    {
      // Serial compressed-row matrix-vector product:
      // soln[i] = sum_k value[k] * x[column_index[k]] over row i's entries
      unsigned n = this->nrow();
      const int* row_start = CR_matrix.row_start();
      const int* column_index = CR_matrix.column_index();
      const double* value = CR_matrix.value();
      double* soln_pt = soln.values_pt();
      const double* x_pt = x.values_pt();
      for (unsigned long i = 0; i < n; i++)
      {
        soln_pt[i] = 0.0;
        for (long k = row_start[i]; k < row_start[i + 1]; k++)
        {
          unsigned long j = column_index[k];
          double a_ij = value[k];
          soln_pt[i] += a_ij * x_pt[j];
        }
      }
    }
  }
1878
  //=================================================================
  /// Multiply the transposed matrix by the vector x: soln=A^T x
  //=================================================================
  // NOTE(review): the first signature line (presumably taking a
  // const DoubleVector& x) is elided in this extraction.
    DoubleVector& soln) const
  {
#ifdef PARANOID
    // check that this matrix is built
    if (!Built)
    {
      std::ostringstream error_message_stream;
      error_message_stream << "This matrix has not been built";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
    // Check to see if x.size() = ncol().
    if (!(*this->distribution_pt() == *x.distribution_pt()))
    {
      std::ostringstream error_message_stream;
      error_message_stream
        << "The x vector and this matrix must have the same distribution.";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
    }
    // if soln is setup then it should have the same distribution as x
    if (soln.built())
    {
      if (soln.distribution_pt()->nrow() != this->ncol())
      {
        std::ostringstream error_message_stream;
        error_message_stream
          << "The soln vector is setup and therefore must have the same "
          << "number of rows as the vector x";
        throw OomphLibError(error_message_stream.str(),
                            OOMPH_CURRENT_FUNCTION,
                            OOMPH_EXCEPTION_LOCATION);
      }
    }
#endif

    // if soln is not setup then setup the distribution
    if (!soln.built())
    {
      // Build a distribution with ncol() rows for the result A^T x
      // (the "new LinearAlgebraDistribution(...)" line is elided in this
      // extraction)
      LinearAlgebraDistribution* dist_pt =
        this->ncol(),
        this->distributed());
      soln.build(dist_pt, 0.0);
      delete dist_pt;
    }

    // Initialise
    soln.initialise(0.0);

    if (this->distributed() &&
        this->distribution_pt()->communicator_pt()->nproc() > 1)
    {
#ifdef OOMPH_HAS_TRILINOS
      // This will only work if we have trilinos on board
      TrilinosEpetraHelpers::multiply(this, x, soln);
#else
      std::ostringstream error_message_stream;
      error_message_stream
        << "Matrix-vector product on multiple processors with distributed "
        << "CRDoubleMatrix requires Trilinos.";
      throw OomphLibError(error_message_stream.str(),
                          OOMPH_CURRENT_FUNCTION,
                          OOMPH_EXCEPTION_LOCATION);
#endif
    }
    else
    {
      // Serial transposed product: scatter each row's contributions,
      // soln[column_index[k]] += value[k] * x[i]
      unsigned n = this->nrow();
      const int* row_start = CR_matrix.row_start();
      const int* column_index = CR_matrix.column_index();
      const double* value = CR_matrix.value();
      double* soln_pt = soln.values_pt();
      const double* x_pt = x.values_pt();
      // Matrix vector product
      for (unsigned long i = 0; i < n; i++)
      {
        for (long k = row_start[i]; k < row_start[i + 1]; k++)
        {
          unsigned long j = column_index[k];
          double a_ij = value[k];
          soln_pt[j] += a_ij * x_pt[i];
        }
      }
    }
  }
1971
1972 //===========================================================================
1973 /// Function to multiply this matrix by the CRDoubleMatrix matrix_in.
1974 /// In a serial matrix, there are 5 methods available:
1975 /// Method 1: First runs through this matrix and matrix_in to find the storage
1976 /// requirements for result - arrays of the correct size are
1977 /// then allocated before performing the calculation.
1978 /// Minimises memory requirements but more costly.
1979 /// Method 2: Grows storage for values and column indices of result 'on the
1980 /// fly' using an array of maps. Faster but more memory
1981 /// intensive.
1982 /// Method 3: Grows storage for values and column indices of result 'on the
1983 /// fly' using a vector of vectors. Not particularly impressive
1984 /// on the platforms we tried...
1985 /// Method 4: Trilinos Epetra Matrix Matrix multiply.
1986 /// Method 5: Trilinos Epetra Matrix Matrix Multiply (ml based)
1987 /// If Trilinos is installed then Method 4 is employed by default, otherwise
1988 /// Method 2 is employed by default.
1989 /// In a distributed matrix, only Trilinos Epetra Matrix Matrix multiply
1990 /// is available.
1991 //=============================================================================
1993                                  CRDoubleMatrix& result) const
1994  {
1995#ifdef PARANOID
1996    // check that this matrix is built
1997    if (!Built)
1998    {
1999      std::ostringstream error_message_stream;
2000      error_message_stream << "This matrix has not been built";
2001      throw OomphLibError(error_message_stream.str(),
2002                          OOMPH_CURRENT_FUNCTION,
2003                          OOMPH_EXCEPTION_LOCATION);
2004    }
2005    // check that the input matrix matrix_in is built
2006    if (!matrix_in.built())
2007    {
2008      std::ostringstream error_message_stream;
2009      error_message_stream << "This matrix matrix_in has not been built";
2010      throw OomphLibError(error_message_stream.str(),
2011                          OOMPH_CURRENT_FUNCTION,
2012                          OOMPH_EXCEPTION_LOCATION);
2013    }
2014    // if result is setup then it must have the same distribution as this
2015    if (result.built())
2016    {
2017      if (!(*result.distribution_pt() == *this->distribution_pt()))
2018      {
2019        std::ostringstream error_message_stream;
2020        error_message_stream
2021          << "The matrix result is setup and therefore must have the same "
2022          << "distribution as the vector x";
2023        throw OomphLibError(error_message_stream.str(),
2024                            OOMPH_CURRENT_FUNCTION,
2025                            OOMPH_EXCEPTION_LOCATION);
2026      }
2027    }
2028#endif
2029
2030    // if the result has not been setup, then store the distribution
2031    if (!result.distribution_built())
2032    {
2033      result.build(this->distribution_pt());
2034    }
2035
2036    // short name for Serial_matrix_matrix_multiply_method
2037    unsigned method = Serial_matrix_matrix_multiply_method;
2038
2039    // if this matrix is not distributed and matrix in is not distributed
2040    if (!this->distributed() && !matrix_in.distributed() &&
2041        ((method == 1) || (method == 2) || (method == 3)))
2042    {
2043      // NB N is number of rows!
2044      unsigned long N = this->nrow();
2045      unsigned long M = matrix_in.ncol();
2046      unsigned long Nnz = 0;
2047
2048      // pointers to arrays which store result
2049      int* Row_start = 0;
2050      double* Value = 0;
2051      int* Column_index = 0;
2052
2053      // get pointers to matrix_in
2054      const int* matrix_in_row_start = matrix_in.row_start();
2055      const int* matrix_in_column_index = matrix_in.column_index();
2056      const double* matrix_in_value = matrix_in.value();
2057
2058      // get pointers to this matrix
2059      const double* this_value = this->value();
2060      const int* this_row_start = this->row_start();
2061      const int* this_column_index = this->column_index();
2062
2063      // clock_t clock1 = clock();
2064
2065      // METHOD 1
2066      // --------
2067      if (method == 1)
2068      {
2069        // allocate storage for row starts
2070        Row_start = new int[N + 1];
2071        Row_start[0] = 0;
2072
2073        // a set to store the distinct column indices in each row of the
            // result (its size gives that row's number of non-zeros)
2074        std::set<unsigned> columns;
2075
2076        // run through rows of this matrix and matrix_in to find number of
2077        // non-zero entries in each row of result
2078        for (unsigned long this_row = 0; this_row < N; this_row++)
2079        {
2080          // run through non-zeros in this_row of this matrix
2081          for (int this_ptr = this_row_start[this_row];
2082               this_ptr < this_row_start[this_row + 1];
2083               this_ptr++)
2084          {
2085            // find column index for non-zero
2086            int matrix_in_row = this_column_index[this_ptr];
2087
2088            // run through corresponding row in matrix_in
2089            for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2090                 matrix_in_ptr < matrix_in_row_start[matrix_in_row + 1];
2091                 matrix_in_ptr++)
2092            {
2093              // find column index for non-zero in matrix_in and store in
2094              // columns
2095              columns.insert(matrix_in_column_index[matrix_in_ptr]);
2096            }
2097          }
2098          // update Row_start
2099          Row_start[this_row + 1] = Row_start[this_row] + columns.size();
2100
2101          // wipe values in columns
2102          columns.clear();
2103        }
2104
2105        // set Nnz
2106        Nnz = Row_start[N];
2107
2108        // allocate arrays for result
2109        Value = new double[Nnz];
2110        Column_index = new int[Nnz];
2111
2112        // set all values of Column_index to -1
            // (-1 flags a not-yet-used slot during the insertion pass below)
2113        for (unsigned long i = 0; i < Nnz; i++)
2114        {
2115          Column_index[i] = -1;
2116        }
2117
2118        // Calculate values for result - first run through rows of this matrix
2119        for (unsigned long this_row = 0; this_row < N; this_row++)
2120        {
2121          // run through non-zeros in this_row
2122          for (int this_ptr = this_row_start[this_row];
2123               this_ptr < this_row_start[this_row + 1];
2124               this_ptr++)
2125          {
2126            // find value of non-zero
2127            double this_val = this_value[this_ptr];
2128
2129            // find column associated with non-zero
2130            int matrix_in_row = this_column_index[this_ptr];
2131
2132            // run through corresponding row in matrix_in
2133            for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2134                 matrix_in_ptr < matrix_in_row_start[matrix_in_row + 1];
2135                 matrix_in_ptr++)
2136            {
2137              // find column index for non-zero in matrix_in
2138              int col = matrix_in_column_index[matrix_in_ptr];
2139
2140              // find position in result to insert value
                  // (linear scan of this row's slots: stop at the first empty
                  // slot (-1), at an existing entry for col, or -- error --
                  // past the end of the row)
2141              for (int ptr = Row_start[this_row];
2142                   ptr <= Row_start[this_row + 1];
2143                   ptr++)
2144              {
2145                if (ptr == Row_start[this_row + 1])
2146                {
2147                  // error - have passed end of row without finding
2148                  // correct column
2149                  std::ostringstream error_message;
2150                  error_message << "Error inserting value in result";
2151
2152                  throw OomphLibError(error_message.str(),
2153                                      OOMPH_CURRENT_FUNCTION,
2154                                      OOMPH_EXCEPTION_LOCATION);
2155                }
2156                else if (Column_index[ptr] == -1)
2157                {
2158                  // first entry for this column index
2159                  Column_index[ptr] = col;
2160                  Value[ptr] = this_val * matrix_in_value[matrix_in_ptr];
2161                  break;
2162                }
2163                else if (Column_index[ptr] == col)
2164                {
2165                  // column index already exists - add value
2166                  Value[ptr] += this_val * matrix_in_value[matrix_in_ptr];
2167                  break;
2168                }
2169              }
2170            }
2171          }
2172        }
2173      }
2174
2175      // METHOD 2
2176      // --------
2177      else if (method == 2)
2178      {
2179        // generate array of maps to store values for result
            // (std::map keeps each row's column indices sorted, so the final
            // Column_index array is assembled in ascending column order)
2180        std::map<int, double>* result_maps = new std::map<int, double>[N];
2181
2182        // run through rows of this matrix
2183        for (unsigned long this_row = 0; this_row < N; this_row++)
2184        {
2185          // run through non-zeros in this_row
2186          for (int this_ptr = this_row_start[this_row];
2187               this_ptr < this_row_start[this_row + 1];
2188               this_ptr++)
2189          {
2190            // find value of non-zero
2191            double this_val = this_value[this_ptr];
2192
2193            // find column index associated with non-zero
2194            int matrix_in_row = this_column_index[this_ptr];
2195
2196            // run through corresponding row in matrix_in
2197            for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2198                 matrix_in_ptr < matrix_in_row_start[matrix_in_row + 1];
2199                 matrix_in_ptr++)
2200            {
2201              // find column index for non-zero in matrix_in
2202              int col = matrix_in_column_index[matrix_in_ptr];
2203
2204              // insert value
2205              result_maps[this_row][col] +=
2206                this_val * matrix_in_value[matrix_in_ptr];
2207            }
2208          }
2209        }
2210
2211        // allocate Row_start
2212        Row_start = new int[N + 1];
2213
2214        // copy across row starts
2215        Row_start[0] = 0;
2216        for (unsigned long row = 0; row < N; row++)
2217        {
2218          int size = result_maps[row].size();
2219          Row_start[row + 1] = Row_start[row] + size;
2220        }
2221
2222        // set Nnz
2223        Nnz = Row_start[N];
2224
2225        // allocate other arrays
2226        Value = new double[Nnz];
2227        Column_index = new int[Nnz];
2228
2229        // copy values and column indices
2230        for (unsigned long row = 0; row < N; row++)
2231        {
2232          unsigned ptr = Row_start[row];
2233          for (std::map<int, double>::iterator i = result_maps[row].begin();
2234               i != result_maps[row].end();
2235               i++)
2236          {
2237            Column_index[ptr] = i->first;
2238            Value[ptr] = i->second;
2239            ptr++;
2240          }
2241        }
2242
2243        // tidy up memory
2244        delete[] result_maps;
2245      }
2246
2247      // METHOD 3
2248      // --------
2249      else if (method == 3)
2250      {
2251        // vectors of vectors to store results
2252        std::vector<std::vector<int>> result_cols(N);
2253        std::vector<std::vector<double>> result_vals(N);
2254
2255        // run through the rows of this matrix
2256        for (unsigned long this_row = 0; this_row < N; this_row++)
2257        {
2258          // run through non-zeros in this_row
2259          for (int this_ptr = this_row_start[this_row];
2260               this_ptr < this_row_start[this_row + 1];
2261               this_ptr++)
2262          {
2263            // find value of non-zero
2264            double this_val = this_value[this_ptr];
2265
2266            // find column index associated with non-zero
2267            int matrix_in_row = this_column_index[this_ptr];
2268
2269            // run through corresponding row in matrix_in
2270            for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2271                 matrix_in_ptr < matrix_in_row_start[matrix_in_row + 1];
2272                 matrix_in_ptr++)
2273            {
2274              // find column index for non-zero in matrix_in
2275              int col = matrix_in_column_index[matrix_in_ptr];
2276
2277              // insert value
                  // (linear search of the columns accumulated so far for this
                  // row; append a new entry if col is not found)
2278              int size = result_cols[this_row].size();
2279              for (int i = 0; i <= size; i++)
2280              {
2281                if (i == size)
2282                {
2283                  // first entry for this column
2284                  result_cols[this_row].push_back(col);
2285                  result_vals[this_row].push_back(
2286                    this_val * matrix_in_value[matrix_in_ptr]);
2287                }
2288                else if (col == result_cols[this_row][i])
2289                {
2290                  // column already exists
2291                  result_vals[this_row][i] +=
2292                    this_val * matrix_in_value[matrix_in_ptr];
2293                  break;
2294                }
2295              }
2296            }
2297          }
2298        }
2299
2300        // allocate Row_start
2301        Row_start = new int[N + 1];
2302
2303        // copy across row starts
2304        Row_start[0] = 0;
2305        for (unsigned long row = 0; row < N; row++)
2306        {
2307          int size = result_cols[row].size();
2308          Row_start[row + 1] = Row_start[row] + size;
2309        }
2310
2311        // set Nnz
2312        Nnz = Row_start[N];
2313
2314        // allocate other arrays
2315        Value = new double[Nnz];
2316        Column_index = new int[Nnz];
2317
2318        // copy across values and column indices
2319        for (unsigned long row = 0; row < N; row++)
2320        {
2321          unsigned ptr = Row_start[row];
2322          unsigned nnn = result_cols[row].size();
2323          for (unsigned i = 0; i < nnn; i++)
2324          {
2325            Column_index[ptr] = result_cols[row][i];
2326            Value[ptr] = result_vals[row][i];
2327            ptr++;
2328          }
2329        }
2330      }
2331
2332      // build the result from the assembled compressed-row arrays
          // (build_without_copy: result adopts Value, Column_index and
          // Row_start rather than copying them)
2333      result.build_without_copy(M, Nnz, Value, Column_index, Row_start);
2334    }
2335
2336    // else we have to use trilinos
2337    else
2338    {
2339#ifdef OOMPH_HAS_TRILINOS
2340      bool use_ml = false;
2341      if (method == 5)
2342      {
2343        use_ml = true;
2344      }
2345      TrilinosEpetraHelpers::multiply(*this, matrix_in, result, use_ml);
2346#else
2347      std::ostringstream error_message;
2348      error_message << "Serial_matrix_matrix_multiply_method = "
2350                    << " requires trilinos.";
2351      throw OomphLibError(
2352        error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2353#endif
2354    }
2355  }
2356
2357
2358 //=================================================================
2359 /// For every row, find the maximum absolute value of the
2360 /// entries in this row. Set all values that are less than alpha times
2361 /// this maximum to zero and return the resulting matrix in
2362 /// reduced_matrix. Note: Diagonal entries are retained regardless
2363 /// of their size.
2364 //=================================================================
2365 void CRDoubleMatrix::matrix_reduction(const double& alpha,
2366 CRDoubleMatrix& reduced_matrix)
2367 {
2368 // number of rows in matrix
2369 long n_row = nrow_local();
2370 double max_row;
2371
2372 // Here's the packed format for the new matrix
2373 Vector<int> B_row_start(1);
2374 Vector<int> B_column_index;
2375 Vector<double> B_value;
2376
2377 // get pointers to the underlying data
2378 const int* row_start = CR_matrix.row_start();
2379 const int* column_index = CR_matrix.column_index();
2380 const double* value = CR_matrix.value();
2381
2382 // k is counter for the number of entries in the reduced matrix
2383 unsigned k = 0;
2384
2385 // Initialise row start
2386 B_row_start[0] = 0;
2387
2388 // Loop over rows
2389 for (long i = 0; i < n_row; i++)
2390 {
2391 // Initialise max value in row
2392 max_row = 0.0;
2393
2394 // Loop over entries in columns
2395 for (long j = row_start[i]; j < row_start[i + 1]; j++)
2396 {
2397 // Find max. value in row
2398 if (std::fabs(value[j]) > max_row)
2399 {
2400 max_row = std::fabs(value[j]);
2401 }
2402 }
2403
2404 // Decide if we need to retain the entries in the row
2405 for (long j = row_start[i]; j < row_start[i + 1]; j++)
2406 {
2407 // If we're on the diagonal or the value is sufficiently large: retain
2408 // i.e. copy across.
2409 if (i == column_index[j] || std::fabs(value[j]) > alpha * max_row)
2410 {
2411 B_value.push_back(value[j]);
2412 B_column_index.push_back(column_index[j]);
2413 k++;
2414 }
2415 }
2416 // This writes the row start for the next row -- equal to
2417 // to the number of entries written so far (C++ zero-based indexing!)
2418 B_row_start.push_back(k);
2419 }
2420
2421 // Build the matrix from the compressed format
2422 dynamic_cast<CRDoubleMatrix&>(reduced_matrix)
2423 .build(this->ncol(), B_value, B_column_index, B_row_start);
2424 }
2425
2426 //=============================================================================
2427 /// if this matrix is distributed then the equivalent global matrix is built
2428 /// using new and returned. The calling method is responsible for the
2429 /// destruction of the new matrix.
2430 //=============================================================================
2432  {
2433#ifdef OOMPH_HAS_MPI
2434    // if this matrix is not distributed then this method is redundant
2435    if (!this->distributed() ||
2436        this->distribution_pt()->communicator_pt()->nproc() == 1)
2437    {
2438      return new CRDoubleMatrix(*this);
2439    }
2440
2441    // nnz
2442    int nnz = this->nnz();
2443
2444    // my nrow local
2445    unsigned nrow_local = this->nrow_local();
2446
2447    // nrow global
2448    unsigned nrow = this->nrow();
2449
2450    // cache nproc
2451    int nproc = this->distribution_pt()->communicator_pt()->nproc();
2452
2453    // get the nnzs on the other processors
2454    int* dist_nnz_pt = new int[nproc];
2455    MPI_Allgather(&nnz,
2456                  1,
2457                  MPI_INT,
2458                  dist_nnz_pt,
2459                  1,
2460                  MPI_INT,
2461                  this->distribution_pt()->communicator_pt()->mpi_comm());
2462
2463    // create an int vector of first rows and nrow local and compute nnz global
2464    int* dist_first_row = new int[nproc];
2465    int* dist_nrow_local = new int[nproc];
2466    int nnz_global = 0;
2467    for (int p = 0; p < nproc; p++)
2468    {
2469      nnz_global += dist_nnz_pt[p];
2470      dist_first_row[p] = this->first_row(p);
2471      dist_nrow_local[p] = this->nrow_local(p);
2472    }
2473
2474    // compute the offset for the values and column index data
2475    int* nnz_offset = new int[nproc];
2476    nnz_offset[0] = 0;
2477    for (int p = 1; p < nproc; p++)
2478    {
2479      nnz_offset[p] = nnz_offset[p - 1] + dist_nnz_pt[p - 1];
2480    }
2481
2482    // get pointers to the (current) distributed data
2483    // const_cast required because MPI requires non-const data when sending
2484    // data
2485    int* dist_row_start = const_cast<int*>(this->row_start());
2486    int* dist_column_index = const_cast<int*>(this->column_index());
2487    double* dist_value = const_cast<double*>(this->value());
2488
2489    // space for the global matrix
2490    int* global_row_start = new int[nrow + 1];
2491    int* global_column_index = new int[nnz_global];
2492    double* global_value = new double[nnz_global];
2493
2494    // get the row starts
        // NOTE: each process contributes the row starts for its own rows;
        // these are offsets into its *local* value array and are shifted
        // by the per-process nnz offsets further below
2495    MPI_Allgatherv(dist_row_start,
2496                   nrow_local,
2497                   MPI_INT,
2498                   global_row_start,
2499                   dist_nrow_local,
2500                   dist_first_row,
2501                   MPI_INT,
2502                   this->distribution_pt()->communicator_pt()->mpi_comm());
2503
2504    // get the column indexes
2505    MPI_Allgatherv(dist_column_index,
2506                   nnz,
2507                   MPI_INT,
2508                   global_column_index,
2509                   dist_nnz_pt,
2510                   nnz_offset,
2511                   MPI_INT,
2512                   this->distribution_pt()->communicator_pt()->mpi_comm());
2513
2514    // get the values
2515    MPI_Allgatherv(dist_value,
2516                   nnz,
2517                   MPI_DOUBLE,
2518                   global_value,
2519                   dist_nnz_pt,
2520                   nnz_offset,
2521                   MPI_DOUBLE,
2522                   this->distribution_pt()->communicator_pt()->mpi_comm());
2523
2524    // finally the last row start
2525    global_row_start[nrow] = nnz_global;
2526
2527    // update the other row start
        // (shift each process's local row starts into the global numbering)
2528    for (int p = 0; p < nproc; p++)
2529    {
2530      for (int i = 0; i < dist_nrow_local[p]; i++)
2531      {
2532        unsigned j = dist_first_row[p] + i;
2533        global_row_start[j] += nnz_offset[p];
2534      }
2535    }
2536
2537    // create the global distribution
2539      this->distribution_pt()->communicator_pt(), nrow, false);
2540
2541    // create the matrix
2542    CRDoubleMatrix* matrix_pt = new CRDoubleMatrix(dist_pt);
2543
2544    // copy of distribution taken so delete
2545    delete dist_pt;
2546
2547    // pass data into matrix
        // (build_without_copy: the new matrix adopts the three arrays
        // allocated above rather than copying them)
2548    matrix_pt->build_without_copy(this->ncol(),
2549                                  nnz_global,
2550                                  global_value,
2551                                  global_column_index,
2552                                  global_row_start);
2553
2554    // clean up
2555    delete[] dist_first_row;
2556    delete[] dist_nrow_local;
2557    delete[] nnz_offset;
2558    delete[] dist_nnz_pt;
2559
2560    // and return
2561    return matrix_pt;
2562#else
2563    return new CRDoubleMatrix(*this);
2564#endif
2565  }
2566
2567 //============================================================================
2568 /// The contents of the matrix are redistributed to match the new
2569 /// distribution. In a non-MPI build this method does nothing.
2570 /// \b NOTE 1: The current distribution and the new distribution must have
2571 /// the same number of global rows.
2572 /// \b NOTE 2: The current distribution and the new distribution must have
2573 /// the same Communicator.
2574 //============================================================================
2576 const LinearAlgebraDistribution* const& dist_pt)
2577 {
2578#ifdef OOMPH_HAS_MPI
2579#ifdef PARANOID
2580 // paranoid check that the nrows for both distributions is the
2581 // same
2582 if (dist_pt->nrow() != this->distribution_pt()->nrow())
2583 {
2584 std::ostringstream error_message;
2585 error_message << "The number of global rows in the new distribution ("
2586 << dist_pt->nrow() << ") is not equal to the number"
2587 << " of global rows in the current distribution ("
2588 << this->distribution_pt()->nrow() << ").\n";
2589 throw OomphLibError(
2590 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2591 }
2592 // paranoid check that the current distribution and the new distribution
2593 // have the same Communicator
2594 OomphCommunicator temp_comm(*dist_pt->communicator_pt());
2595 if (!(temp_comm == *this->distribution_pt()->communicator_pt()))
2596 {
2597 std::ostringstream error_message;
2598 error_message << "The new distribution and the current distribution must "
2599 << "have the same communicator.";
2600 throw OomphLibError(
2601 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2602 }
2603 // paranoid check that the matrix is build
2604 if (!this->built())
2605 {
2606 std::ostringstream error_message;
2607 error_message << "The matrix must be build to be redistributed";
2608 throw OomphLibError(
2609 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2610 }
2611#endif
2612
2613 // if the two distributions are not the same
2614 // =========================================
2615 if (!((*this->distribution_pt()) == *dist_pt))
2616 {
2617 // Get the number of columns to build the matrix.
2618 unsigned long ncol = this->ncol();
2619
2620 // current data
2621 int* current_row_start = this->row_start();
2622 int* current_column_index = this->column_index();
2623 double* current_value = this->value();
2624
2625 // get the rank and the number of processors
2626 int my_rank = this->distribution_pt()->communicator_pt()->my_rank();
2627 int nproc = this->distribution_pt()->communicator_pt()->nproc();
2628
2629 // if both distributions are distributed
2630 // =====================================
2631 if (this->distributed() && dist_pt->distributed())
2632 {
2633 // new nrow_local and first_row data
2634 Vector<unsigned> new_first_row(nproc);
2635 Vector<unsigned> new_nrow_local(nproc);
2636 Vector<unsigned> current_first_row(nproc);
2637 Vector<unsigned> current_nrow_local(nproc);
2638 for (int i = 0; i < nproc; i++)
2639 {
2640 new_first_row[i] = dist_pt->first_row(i);
2641 new_nrow_local[i] = dist_pt->nrow_local(i);
2642 current_first_row[i] = this->first_row(i);
2643 current_nrow_local[i] = this->nrow_local(i);
2644 }
2645
2646 // compute which local rows are expected to be received from each
2647 // processor / sent to each processor
2648 Vector<unsigned> first_row_for_proc(nproc, 0);
2649 Vector<unsigned> nrow_local_for_proc(nproc, 0);
2650 Vector<unsigned> first_row_from_proc(nproc, 0);
2651 Vector<unsigned> nrow_local_from_proc(nproc, 0);
2652
2653 // for every processor compute first_row and nrow_local that will
2654 // will sent and received by this processor
2655 for (int p = 0; p < nproc; p++)
2656 {
2657 // start with data to be sent
2658 if ((new_first_row[p] <
2659 (current_first_row[my_rank] + current_nrow_local[my_rank])) &&
2660 (current_first_row[my_rank] <
2661 (new_first_row[p] + new_nrow_local[p])))
2662 {
2663 first_row_for_proc[p] =
2664 std::max(current_first_row[my_rank], new_first_row[p]);
2665 nrow_local_for_proc[p] =
2666 std::min(
2667 (current_first_row[my_rank] + current_nrow_local[my_rank]),
2668 (new_first_row[p] + new_nrow_local[p])) -
2669 first_row_for_proc[p];
2670 }
2671
2672 // and data to be received
2673 if ((new_first_row[my_rank] <
2674 (current_first_row[p] + current_nrow_local[p])) &&
2675 (current_first_row[p] <
2676 (new_first_row[my_rank] + new_nrow_local[my_rank])))
2677 {
2678 first_row_from_proc[p] =
2679 std::max(current_first_row[p], new_first_row[my_rank]);
2680 nrow_local_from_proc[p] =
2681 std::min((current_first_row[p] + current_nrow_local[p]),
2682 (new_first_row[my_rank] + new_nrow_local[my_rank])) -
2683 first_row_from_proc[p];
2684 }
2685 }
2686
2687 // determine how many nnzs to send to each processor
2688 Vector<unsigned> nnz_for_proc(nproc, 0);
2689 for (int p = 0; p < nproc; p++)
2690 {
2691 if (nrow_local_for_proc[p] > 0)
2692 {
2693 nnz_for_proc[p] = (current_row_start[first_row_for_proc[p] -
2694 current_first_row[my_rank] +
2695 nrow_local_for_proc[p]] -
2696 current_row_start[first_row_for_proc[p] -
2697 current_first_row[my_rank]]);
2698 }
2699 }
2700
2701 // next post non-blocking sends and recv for the nnzs
2702 Vector<unsigned> nnz_from_proc(nproc, 0);
2703 Vector<MPI_Request> send_req;
2704 Vector<MPI_Request> nnz_recv_req;
2705 for (int p = 0; p < nproc; p++)
2706 {
2707 if (p != my_rank)
2708 {
2709 // send
2710 if (nrow_local_for_proc[p] > 0)
2711 {
2712 MPI_Request req;
2713 MPI_Isend(&nnz_for_proc[p],
2714 1,
2715 MPI_UNSIGNED,
2716 p,
2717 0,
2718 this->distribution_pt()->communicator_pt()->mpi_comm(),
2719 &req);
2720 send_req.push_back(req);
2721 }
2722
2723 // recv
2724 if (nrow_local_from_proc[p] > 0)
2725 {
2726 MPI_Request req;
2727 MPI_Irecv(&nnz_from_proc[p],
2728 1,
2729 MPI_UNSIGNED,
2730 p,
2731 0,
2732 this->distribution_pt()->communicator_pt()->mpi_comm(),
2733 &req);
2734 nnz_recv_req.push_back(req);
2735 }
2736 }
2737 // "send to self"
2738 else
2739 {
2740 nnz_from_proc[p] = nnz_for_proc[p];
2741 }
2742 }
2743
2744 // allocate new storage for the new row_start
2745 int* new_row_start = new int[new_nrow_local[my_rank] + 1];
2746
2747 // wait for recvs to complete
2748 unsigned n_recv_req = nnz_recv_req.size();
2749 if (n_recv_req > 0)
2750 {
2751 Vector<MPI_Status> recv_status(n_recv_req);
2752 MPI_Waitall(n_recv_req, &nnz_recv_req[0], &recv_status[0]);
2753 }
2754
2755 // compute the nnz offset for each processor
2756 unsigned next_row = 0;
2757 unsigned nnz_count = 0;
2758 Vector<unsigned> nnz_offset(nproc, 0);
2759 for (int p = 0; p < nproc; p++)
2760 {
2761 unsigned pp = 0;
2762 while (new_first_row[pp] != next_row)
2763 {
2764 pp++;
2765 }
2766 nnz_offset[pp] = nnz_count;
2767 nnz_count += nnz_from_proc[pp];
2768 next_row += new_nrow_local[pp];
2769 }
2770
2771 // allocate storage for the values and column indices
2772 int* new_column_index = new int[nnz_count];
2773 double* new_value = new double[nnz_count];
2774
2775 // post the sends and recvs for the matrix data
2776 Vector<MPI_Request> recv_req;
2777 MPI_Aint base_address;
2778 MPI_Get_address(new_value, &base_address);
2779 for (int p = 0; p < nproc; p++)
2780 {
2781 // communicated with other processors
2782 if (p != my_rank)
2783 {
2784 // SEND
2785 if (nrow_local_for_proc[p] > 0)
2786 {
2787 // array of datatypes
2788 MPI_Datatype types[3];
2789
2790 // array of offsets
2791 MPI_Aint offsets[3];
2792
2793 // array of lengths
2794 int len[3];
2795
2796 // row start
2797 unsigned first_row_to_send =
2798 first_row_for_proc[p] - current_first_row[my_rank];
2799 MPI_Type_contiguous(nrow_local_for_proc[p], MPI_INT, &types[0]);
2800 MPI_Type_commit(&types[0]);
2801 len[0] = 1;
2802 MPI_Get_address(current_row_start + first_row_to_send,
2803 &offsets[0]);
2804 offsets[0] -= base_address;
2805
2806 // values
2807 unsigned first_coef_to_send =
2808 current_row_start[first_row_to_send];
2809 MPI_Type_contiguous(nnz_for_proc[p], MPI_DOUBLE, &types[1]);
2810 MPI_Type_commit(&types[1]);
2811 len[1] = 1;
2812 MPI_Get_address(current_value + first_coef_to_send, &offsets[1]);
2813 offsets[1] -= base_address;
2814
2815 // column index
2816 MPI_Type_contiguous(nnz_for_proc[p], MPI_DOUBLE, &types[2]);
2817 MPI_Type_commit(&types[2]);
2818 len[2] = 1;
2819 MPI_Get_address(current_column_index + first_coef_to_send,
2820 &offsets[2]);
2821 offsets[2] -= base_address;
2822
2823 // build the combined datatype
2824 MPI_Datatype send_type;
2825 MPI_Type_create_struct(3, len, offsets, types, &send_type);
2826 MPI_Type_commit(&send_type);
2827 MPI_Type_free(&types[0]);
2828 MPI_Type_free(&types[1]);
2829 MPI_Type_free(&types[2]);
2830
2831 // and send
2832 MPI_Request req;
2833 MPI_Isend(new_value,
2834 1,
2835 send_type,
2836 p,
2837 1,
2838 this->distribution_pt()->communicator_pt()->mpi_comm(),
2839 &req);
2840 send_req.push_back(req);
2841 MPI_Type_free(&send_type);
2842 }
2843
2844 // RECV
2845 if (nrow_local_from_proc[p] > 0)
2846 {
2847 // array of datatypes
2848 MPI_Datatype types[3];
2849
2850 // array of offsets
2851 MPI_Aint offsets[3];
2852
2853 // array of lengths
2854 int len[3];
2855
2856 // row start
2857 unsigned first_row_to_recv =
2858 first_row_from_proc[p] - new_first_row[my_rank];
2859 MPI_Type_contiguous(nrow_local_from_proc[p], MPI_INT, &types[0]);
2860 MPI_Type_commit(&types[0]);
2861 len[0] = 1;
2862 MPI_Get_address(new_row_start + first_row_to_recv, &offsets[0]);
2863 offsets[0] -= base_address;
2864
2865 // values
2866 unsigned first_coef_to_recv = nnz_offset[p];
2867 MPI_Type_contiguous(nnz_from_proc[p], MPI_DOUBLE, &types[1]);
2868 MPI_Type_commit(&types[1]);
2869 len[1] = 1;
2870 MPI_Get_address(new_value + first_coef_to_recv, &offsets[1]);
2871 offsets[1] -= base_address;
2872
2873 // column index
2874 MPI_Type_contiguous(nnz_from_proc[p], MPI_INT, &types[2]);
2875 MPI_Type_commit(&types[2]);
2876 len[2] = 1;
2877 MPI_Get_address(new_column_index + first_coef_to_recv,
2878 &offsets[2]);
2879 offsets[2] -= base_address;
2880
2881 // build the combined datatype
2882 MPI_Datatype recv_type;
2883 MPI_Type_create_struct(3, len, offsets, types, &recv_type);
2884 MPI_Type_commit(&recv_type);
2885 MPI_Type_free(&types[0]);
2886 MPI_Type_free(&types[1]);
2887 MPI_Type_free(&types[2]);
2888
2889            // and receive
2890 MPI_Request req;
2891 MPI_Irecv(new_value,
2892 1,
2893 recv_type,
2894 p,
2895 1,
2896 this->distribution_pt()->communicator_pt()->mpi_comm(),
2897 &req);
2898 recv_req.push_back(req);
2899 MPI_Type_free(&recv_type);
2900 }
2901 }
2902 // other wise transfer data internally
2903 else
2904 {
2905 unsigned j =
2906 first_row_for_proc[my_rank] - current_first_row[my_rank];
2907 unsigned k = first_row_from_proc[my_rank] - new_first_row[my_rank];
2908 for (unsigned i = 0; i < nrow_local_for_proc[my_rank]; i++)
2909 {
2910 new_row_start[k + i] = current_row_start[j + i];
2911 }
2912 unsigned first_coef_to_send = current_row_start[j];
2913 for (unsigned i = 0; i < nnz_for_proc[my_rank]; i++)
2914 {
2915 new_value[nnz_offset[p] + i] =
2916 current_value[first_coef_to_send + i];
2917 new_column_index[nnz_offset[p] + i] =
2918 current_column_index[first_coef_to_send + i];
2919 }
2920 }
2921 }
2922
2923 // wait for all recvs to complete
2924 n_recv_req = recv_req.size();
2925 if (n_recv_req > 0)
2926 {
2927 Vector<MPI_Status> recv_status(n_recv_req);
2928 MPI_Waitall(n_recv_req, &recv_req[0], &recv_status[0]);
2929 }
2930
2931 // next we need to update the row starts
2932 for (int p = 0; p < nproc; p++)
2933 {
2934 if (nrow_local_from_proc[p] > 0)
2935 {
2936 unsigned first_row =
2937 first_row_from_proc[p] - new_first_row[my_rank];
2938 unsigned last_row = first_row + nrow_local_from_proc[p] - 1;
2939 int update = nnz_offset[p] - new_row_start[first_row];
2940 for (unsigned i = first_row; i <= last_row; i++)
2941 {
2942 new_row_start[i] += update;
2943 }
2944 }
2945 }
2946 new_row_start[dist_pt->nrow_local()] = nnz_count;
2947
2948 // wait for sends to complete
2949 unsigned n_send_req = send_req.size();
2950 if (n_recv_req > 0)
2951 {
2952 Vector<MPI_Status> send_status(n_recv_req);
2953 MPI_Waitall(n_send_req, &send_req[0], &send_status[0]);
2954 }
2955 // if (my_rank == 0)
2956 // {
2957 // CRDoubleMatrix* m_pt = this->global_matrix();
2958 // m_pt->sparse_indexed_output("m1.dat");
2959 // }
2960
2961 //
2962 this->build(dist_pt);
2963 this->build_without_copy(
2964 ncol, nnz_count, new_value, new_column_index, new_row_start);
2965 // if (my_rank == 0)
2966 // {
2967 // CRDoubleMatrix* m_pt = this->global_matrix();
2968 // m_pt->sparse_indexed_output("m2.dat");
2969 // }
2970 // this->sparse_indexed_output(oomph_info);
2971 abort();
2972 }
2973
2974 // if this matrix is distributed but the new distributed matrix is global
2975 // ======================================================================
2976 else if (this->distributed() && !dist_pt->distributed())
2977 {
2978 // nnz
2979 int nnz = this->nnz();
2980
2981 // nrow global
2982 unsigned nrow = this->nrow();
2983
2984 // cache nproc
2985 int nproc = this->distribution_pt()->communicator_pt()->nproc();
2986
2987 // get the nnzs on the other processors
2988 int* dist_nnz_pt = new int[nproc];
2989 MPI_Allgather(&nnz,
2990 1,
2991 MPI_INT,
2992 dist_nnz_pt,
2993 1,
2994 MPI_INT,
2995 this->distribution_pt()->communicator_pt()->mpi_comm());
2996
2997 // create an int array of first rows and nrow local and
2998 // compute nnz global
2999 int* dist_first_row = new int[nproc];
3000 int* dist_nrow_local = new int[nproc];
3001 for (int p = 0; p < nproc; p++)
3002 {
3003 dist_first_row[p] = this->first_row(p);
3004 dist_nrow_local[p] = this->nrow_local(p);
3005 }
3006
3007 // conpute the offset for the values and column index data
3008 // compute the nnz offset for each processor
3009 int next_row = 0;
3010 unsigned nnz_count = 0;
3011 Vector<unsigned> nnz_offset(nproc, 0);
3012 for (int p = 0; p < nproc; p++)
3013 {
3014 unsigned pp = 0;
3015 while (dist_first_row[pp] != next_row)
3016 {
3017 pp++;
3018 }
3019 nnz_offset[pp] = nnz_count;
3020 nnz_count += dist_nnz_pt[pp];
3021 next_row += dist_nrow_local[pp];
3022 }
3023
3024 // get pointers to the (current) distributed data
3025 int* dist_row_start = this->row_start();
3026 int* dist_column_index = this->column_index();
3027 double* dist_value = this->value();
3028
3029 // space for the global matrix
3030 int* global_row_start = new int[nrow + 1];
3031 int* global_column_index = new int[nnz_count];
3032 double* global_value = new double[nnz_count];
3033
3034 // post the sends and recvs for the matrix data
3035 Vector<MPI_Request> recv_req;
3036 Vector<MPI_Request> send_req;
3037 MPI_Aint base_address;
3038 MPI_Get_address(global_value, &base_address);
3039
3040 // SEND
3041 if (dist_nrow_local[my_rank] > 0)
3042 {
3043 // types
3044 MPI_Datatype types[3];
3045
3046 // offsets
3047 MPI_Aint offsets[3];
3048
3049 // lengths
3050 int len[3];
3051
3052 // row start
3053 MPI_Type_contiguous(dist_nrow_local[my_rank], MPI_INT, &types[0]);
3054 MPI_Type_commit(&types[0]);
3055 MPI_Get_address(dist_row_start, &offsets[0]);
3056 offsets[0] -= base_address;
3057 len[0] = 1;
3058
3059 // value
3060 MPI_Type_contiguous(nnz, MPI_DOUBLE, &types[1]);
3061 MPI_Type_commit(&types[1]);
3062 MPI_Get_address(dist_value, &offsets[1]);
3063 offsets[1] -= base_address;
3064 len[1] = 1;
3065
3066 // column indices
3067 MPI_Type_contiguous(nnz, MPI_INT, &types[2]);
3068 MPI_Type_commit(&types[2]);
3069 MPI_Get_address(dist_column_index, &offsets[2]);
3070 offsets[2] -= base_address;
3071 len[2] = 1;
3072
3073 // build the send type
3074 MPI_Datatype send_type;
3075 MPI_Type_create_struct(3, len, offsets, types, &send_type);
3076 MPI_Type_commit(&send_type);
3077 MPI_Type_free(&types[0]);
3078 MPI_Type_free(&types[1]);
3079 MPI_Type_free(&types[2]);
3080
3081 // and send
3082 for (int p = 0; p < nproc; p++)
3083 {
3084 if (p != my_rank)
3085 {
3086 MPI_Request req;
3087 MPI_Isend(global_value,
3088 1,
3089 send_type,
3090 p,
3091 1,
3092 this->distribution_pt()->communicator_pt()->mpi_comm(),
3093 &req);
3094 send_req.push_back(req);
3095 }
3096 }
3097 MPI_Type_free(&send_type);
3098 }
3099
3100 // RECV
3101 for (int p = 0; p < nproc; p++)
3102 {
3103 // communicated with other processors
3104 if (p != my_rank)
3105 {
3106 // RECV
3107 if (dist_nrow_local[p] > 0)
3108 {
3109 // types
3110 MPI_Datatype types[3];
3111
3112 // offsets
3113 MPI_Aint offsets[3];
3114
3115 // lengths
3116 int len[3];
3117
3118 // row start
3119 MPI_Type_contiguous(dist_nrow_local[p], MPI_INT, &types[0]);
3120 MPI_Type_commit(&types[0]);
3121 MPI_Get_address(global_row_start + dist_first_row[p],
3122 &offsets[0]);
3123 offsets[0] -= base_address;
3124 len[0] = 1;
3125
3126 // value
3127 MPI_Type_contiguous(dist_nnz_pt[p], MPI_DOUBLE, &types[1]);
3128 MPI_Type_commit(&types[1]);
3129 MPI_Get_address(global_value + nnz_offset[p], &offsets[1]);
3130 offsets[1] -= base_address;
3131 len[1] = 1;
3132
3133 // column indices
3134 MPI_Type_contiguous(dist_nnz_pt[p], MPI_INT, &types[2]);
3135 MPI_Type_commit(&types[2]);
3136 MPI_Get_address(global_column_index + nnz_offset[p], &offsets[2]);
3137 offsets[2] -= base_address;
3138 len[2] = 1;
3139
3140 // build the send type
3141 MPI_Datatype recv_type;
3142 MPI_Type_create_struct(3, len, offsets, types, &recv_type);
3143 MPI_Type_commit(&recv_type);
3144 MPI_Type_free(&types[0]);
3145 MPI_Type_free(&types[1]);
3146 MPI_Type_free(&types[2]);
3147
3148 // and send
3149 MPI_Request req;
3150 MPI_Irecv(global_value,
3151 1,
3152 recv_type,
3153 p,
3154 1,
3155 this->distribution_pt()->communicator_pt()->mpi_comm(),
3156 &req);
3157 recv_req.push_back(req);
3158 MPI_Type_free(&recv_type);
3159 }
3160 }
3161 // otherwise send to self
3162 else
3163 {
3164 unsigned nrow_local = dist_nrow_local[my_rank];
3165 unsigned first_row = dist_first_row[my_rank];
3166 for (unsigned i = 0; i < nrow_local; i++)
3167 {
3168 global_row_start[first_row + i] = dist_row_start[i];
3169 }
3170 unsigned offset = nnz_offset[my_rank];
3171 for (int i = 0; i < nnz; i++)
3172 {
3173 global_value[offset + i] = dist_value[i];
3174 global_column_index[offset + i] = dist_column_index[i];
3175 }
3176 }
3177 }
3178
3179 // wait for all recvs to complete
3180 unsigned n_recv_req = recv_req.size();
3181 if (n_recv_req > 0)
3182 {
3183 Vector<MPI_Status> recv_status(n_recv_req);
3184 MPI_Waitall(n_recv_req, &recv_req[0], &recv_status[0]);
3185 }
3186
3187 // finally the last row start
3188 global_row_start[nrow] = nnz_count;
3189
3190 // update the other row start
3191 for (int p = 0; p < nproc; p++)
3192 {
3193 for (int i = 0; i < dist_nrow_local[p]; i++)
3194 {
3195 unsigned j = dist_first_row[p] + i;
3196 global_row_start[j] += nnz_offset[p];
3197 }
3198 }
3199
3200 // wait for sends to complete
3201 unsigned n_send_req = send_req.size();
3202 if (n_recv_req > 0)
3203 {
3204 Vector<MPI_Status> send_status(n_recv_req);
3205 MPI_Waitall(n_send_req, &send_req[0], &send_status[0]);
3206 }
3207
3208 // rebuild the matrix
3210 this->distribution_pt()->communicator_pt(), nrow, false);
3211 this->build(dist_pt);
3212 this->build_without_copy(
3213 ncol, nnz_count, global_value, global_column_index, global_row_start);
3214
3215 // clean up
3216 delete dist_pt;
3217 delete[] dist_first_row;
3218 delete[] dist_nrow_local;
3219 delete[] dist_nnz_pt;
3220 }
3221
3222 // other the matrix is not distributed but it needs to be turned
3223 // into a distributed matrix
3224 // =============================================================
3225 else if (!this->distributed() && dist_pt->distributed())
3226 {
3227 // cache the new nrow_local
3228 unsigned nrow_local = dist_pt->nrow_local();
3229
3230 // and first_row
3231 unsigned first_row = dist_pt->first_row();
3232
3233 // get pointers to the (current) distributed data
3234 int* global_row_start = this->row_start();
3235 int* global_column_index = this->column_index();
3236 double* global_value = this->value();
3237
3238 // determine the number of non zeros required by this processor
3239 unsigned nnz = global_row_start[first_row + nrow_local] -
3240 global_row_start[first_row];
3241
3242 // allocate
3243 int* dist_row_start = new int[nrow_local + 1];
3244 int* dist_column_index = new int[nnz];
3245 double* dist_value = new double[nnz];
3246
3247 // copy
3248 int offset = global_row_start[first_row];
3249 for (unsigned i = 0; i <= nrow_local; i++)
3250 {
3251 dist_row_start[i] = global_row_start[first_row + i] - offset;
3252 }
3253 for (unsigned i = 0; i < nnz; i++)
3254 {
3255 dist_column_index[i] = global_column_index[offset + i];
3256 dist_value[i] = global_value[offset + i];
3257 }
3258
3259 // rebuild
3260 this->build(dist_pt);
3261 this->build_without_copy(
3262 ncol, nnz, dist_value, dist_column_index, dist_row_start);
3263 }
3264 }
3265#endif
3266 }
3267
3268 //=============================================================================
3269 /// Compute transpose of matrix
3270 //=============================================================================
3272 {
3273 // Get the number of non_zeros
3274 unsigned long nnon_zeros = this->nnz();
3275
3276 // Find the number of rows and columns in the transposed
3277 // matrix. We differentiate these from those associated
3278 // with the original matrix by appending the characters
3279 // '_t' onto the end
3280 unsigned long n_rows = this->nrow();
3281 unsigned long n_rows_t = this->ncol();
3282
3283#ifdef OOMPH_HAS_MPI
3284 // We only need to produce a warning if the matrix is distributed
3285 if (this->distributed())
3286 {
3287 // Create an ostringstream object to store the warning message
3288 std::ostringstream warning_message;
3289
3290 // Create the warning messsage
3291 warning_message << "This method currently works for serial but "
3292 << "has not been tested with MPI!\n";
3293
3294 // Issue the warning
3295 OomphLibWarning(warning_message.str(),
3296 OOMPH_CURRENT_FUNCTION,
3297 OOMPH_EXCEPTION_LOCATION);
3298 }
3299#endif
3300
3301 // Set up the distribution for the transposed matrix
3302 result->distribution_pt()->build(
3303 this->distribution_pt()->communicator_pt(), n_rows_t, false);
3304
3305 // Acquire access to the value, row_start and column_index
3306 // arrays from the CR matrix
3307 const double* value_pt = this->value();
3308 const int* row_start_pt = this->row_start();
3309 const int* column_index_pt = this->column_index();
3310
3311 // Allocate space for the row_start and column_index vectors
3312 // associated with the transpose of the current matrix.
3313 Vector<double> value_t(nnon_zeros, 0.0);
3314 Vector<int> column_index_t(nnon_zeros, 0);
3315 Vector<int> row_start_t(n_rows_t + 1, 0);
3316
3317 // Loop over the column index vector and count how many times
3318 // each column number occurs and increment the appropriate
3319 // entry in row_start_t whose i+1'th entry will contain the
3320 // number of non-zeros in the i'th column of the matrix (this
3321 // is done so that the cumulative sum done next returns the
3322 // correct result)
3323 for (unsigned i = 0; i < nnon_zeros; i++)
3324 {
3325 // Assign entries to row_start_t (noting the first
3326 // entry is left as 0 for the cumulative sum done later)
3327 row_start_t[*(column_index_pt + i) + 1]++;
3328 }
3329
3330 // Calculate the sum of the first i entries in the row_start_t
3331 // vector and store the values in the i'th entry of row_start_t
3332 for (unsigned i = 1; i < n_rows_t + 1; i++)
3333 {
3334 // Calculate the cumulative sum
3335 row_start_t[i] += row_start_t[i - 1];
3336 }
3337
3338 // Allocate space for variables to store the indices of the
3339 // start and end of the non-zeros in a given row of the matrix
3340 unsigned i_row_s = 0;
3341 unsigned i_row_e = 0;
3342
3343 // Initialise 3 extra variables for readability of the
3344 // code in the subsequent piece of code
3345 unsigned a = 0;
3346 unsigned b = 0;
3347 unsigned c = 0;
3348
3349 // Vector needed to count the number of entries added
3350 // to each segment in column_index_t where each segment
3351 // is associated with each row in the transposed matrix
3352 Vector<int> counter(n_rows_t, 0);
3353
3354 // Set the entries in column_index_t. To do this we loop
3355 // over each row of the original matrix (equivalently
3356 // the number of columns in the transpose)
3357 for (unsigned i_row = 0; i_row < n_rows; i_row++)
3358 {
3359 // Here we find the indices of the start and end
3360 // of the non-zeros in i_row'th row of the matrix.
3361 // [Note, there should be a -1 on i_row_e but this
3362 // is ignored so that we use a strict inequality
3363 // in the subsequent for-loop!]
3364 i_row_s = *(row_start_pt + i_row);
3365 i_row_e = *(row_start_pt + i_row + 1);
3366
3367 // Loop over the entries in the i_row'th row
3368 // of the matrix
3369 for (unsigned j = i_row_s; j < i_row_e; j++)
3370 {
3371 // The value of a is the column index of the j'th
3372 // element in the i_row'th row of the original matrix
3373 // (which is also the row index of the associated
3374 // non-zero in the transposed matrix)
3375 a = *(column_index_pt + j);
3376
3377 // The value of b will be used to find the start
3378 // of the appropriate segment of column_index_t
3379 // that the non-zero belongs in
3380 b = row_start_t[a];
3381
3382 // Find the number of elements already added to
3383 // this segment (to find which entry of the segment
3384 // the value i_row, the column index of the non-zero
3385 // in the transposed matrix, should be assigned to)
3386 c = counter[*(column_index_pt + j)];
3387
3388 // Assign the value i_row to the appropriate entry
3389 // in column_index_t
3390 column_index_t[b + c] = i_row;
3391 value_t[b + c] = *(value_pt + j);
3392
3393 // Increment the j'th value of the vector counter
3394 // to indicate another non-zero index has been
3395 // added into the
3396 counter[*(column_index_pt + j)]++;
3397
3398 } // End of for-loop over i_row'th row of the matrix
3399
3400 } // End of for-loop over rows of the matrix
3401
3402 // Build the matrix (note: the value of n_cols for the
3403 // transposed matrix is n_rows for the original matrix)
3404 result->build(n_rows, value_t, column_index_t, row_start_t);
3405
3406 } // End of the function
3407
3408
3409 //=============================================================================
3410 /// Compute infinity (maximum) norm of matrix
3411 //=============================================================================
3413 {
3414#ifdef PARANOID
3415 // paranoid check that the vector is setup
3416 if (!this->distribution_built())
3417 {
3418 std::ostringstream error_message;
3419 error_message << "This matrix must be setup.";
3420 throw OomphLibError(
3421 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3422 }
3423#endif
3424
3425 // compute the local norm
3426 unsigned nrow_local = this->nrow_local();
3427 double n = 0;
3428 const int* row_start = CR_matrix.row_start();
3429 const double* value = CR_matrix.value();
3430 for (unsigned i = 0; i < nrow_local; i++)
3431 {
3432 double a = 0;
3433 for (int j = row_start[i]; j < row_start[i + 1]; j++)
3434 {
3435 a += fabs(value[j]);
3436 }
3437 n = std::max(n, a);
3438 }
3439
3440 // if this vector is distributed and on multiple processors then gather
3441#ifdef OOMPH_HAS_MPI
3442 double n2 = n;
3443 if (this->distributed() &&
3444 this->distribution_pt()->communicator_pt()->nproc() > 1)
3445 {
3446 MPI_Allreduce(&n,
3447 &n2,
3448 1,
3449 MPI_DOUBLE,
3450 MPI_MAX,
3451 this->distribution_pt()->communicator_pt()->mpi_comm());
3452 }
3453 n = n2;
3454#endif
3455
3456 // and return
3457 return n;
3458 }
3459
3460 //=============================================================================
3461 /// Return the diagonal entries of the matrix.
3462 /// This only works with square matrices. This condition may be relaxed
3463 /// in the future if need be.
3464 //=============================================================================
3466 {
3467#ifdef PARANOID
3468 // Check if the matrix has been built.
3469 if (!this->built())
3470 {
3471 std::ostringstream error_message;
3472 error_message << "The matrix has not been built.\n"
3473 << "Please build it...\n";
3474 throw OomphLibError(
3475 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3476 }
3477
3478 // Check if this is a square matrix.
3479 if (this->nrow() != this->ncol())
3480 {
3481 std::ostringstream error_message;
3482 error_message << "The matrix is not square. Can only get the diagonal\n"
3483 << "entries of a square matrix.\n";
3484 throw OomphLibError(
3485 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3486 }
3487#endif
3488
3489 // We get the diagonal entries on this processor.
3490 unsigned nrow_local = this->nrow_local();
3491
3492 // Create storage for the diagonal entries.
3493 Vector<double> result_vec;
3494 result_vec.reserve(nrow_local);
3495
3496 // Get the first row for the column offset.
3497 unsigned first_row = this->first_row();
3498
3499 // Loop through the local rows.
3500 for (unsigned i = 0; i < nrow_local; i++)
3501 {
3502 // The column entries are globally indexed.
3503 unsigned diag_entry_col = first_row + i;
3504
3505 // Push back the diagonal entry.
3506 result_vec.push_back(CR_matrix.get_entry(i, diag_entry_col));
3507 }
3508
3509 return result_vec;
3510 }
3511
3512 //=============================================================================
3513 /// Element-wise addition of this matrix with matrix_in.
3514 //=============================================================================
3516 CRDoubleMatrix& result_matrix) const
3517 {
3518#ifdef PARANOID
3519 // Check if this matrix is built.
3520 if (!this->built())
3521 {
3522 std::ostringstream error_message;
3523 error_message << "The matrix is not built.\n"
3524 << "Please build the matrix!\n";
3525 throw OomphLibError(
3526 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3527 }
3528
3529 // Check if this matrix_in is built.
3530 if (!matrix_in.built())
3531 {
3532 std::ostringstream error_message;
3533 error_message << "The matrix matrix_in is not built.\n"
3534 << "Please build the matrix!\n";
3535 throw OomphLibError(
3536 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3537 }
3538
3539 // Check if the dimensions of this matrix and matrix_in are the same.
3540 unsigned long this_nrow = this->nrow();
3541 unsigned long matrix_in_nrow = matrix_in.nrow();
3542 if (this_nrow != matrix_in_nrow)
3543 {
3544 std::ostringstream error_message;
3545 error_message << "matrix_in has a different number of rows than\n"
3546 << "this matrix.\n";
3547 throw OomphLibError(
3548 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3549 }
3550
3551 unsigned long this_ncol = this->ncol();
3552 unsigned long matrix_in_ncol = matrix_in.ncol();
3553 if (this_ncol != matrix_in_ncol)
3554 {
3555 std::ostringstream error_message;
3556 error_message << "matrix_in has a different number of columns than\n"
3557 << "this matrix.\n";
3558 throw OomphLibError(
3559 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3560 }
3561
3562 // Check if the distribution is the same (Otherwise we may have to send and
3563 // receive data from other processors - which is not implemented!)
3564 if (*(this->distribution_pt()) != *(matrix_in.distribution_pt()))
3565 {
3566 std::ostringstream error_message;
3567 error_message << "matrix_in must have the same distribution as\n"
3568 << "this matrix.\n";
3569 throw OomphLibError(
3570 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3571 }
3572
3573 // If the matrix is built, check that it's existing distribution is the
3574 // same as the in matrix. Since we'll use the same distribution instead
3575 // of completely rebuilding it.
3576 if (result_matrix.built() &&
3577 (*result_matrix.distribution_pt() != *matrix_in.distribution_pt()))
3578 {
3579 std::ostringstream error_message;
3580 error_message << "The result_matrix is built. "
3581 << "But has a different distribution from matrix_in \n"
3582 << "They need to be the same.\n";
3583 throw OomphLibError(
3584 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3585 }
3586#endif
3587
3588 // To add the elements of two CRDoubleMatrices, we need to know the union of
3589 // the sparsity patterns. This is determined by the column indices.
3590 // We add the column indices and values (entries) as a key-value pair in
3591 // a map (per row). We then read these out into two column indices and
3592 // values vector for the result matrix.
3593
3594 unsigned nrow_local = this->nrow_local();
3595 Vector<int> res_column_indices;
3596 Vector<double> res_values;
3597 Vector<int> res_row_start;
3598 res_row_start.reserve(nrow_local + 1);
3599
3600 // The row_start and column_indices
3601 const int* this_column_indices = this->column_index();
3602 const int* this_row_start = this->row_start();
3603 const int* in_column_indices = matrix_in.column_index();
3604 const int* in_row_start = matrix_in.row_start();
3605
3606 // Values from this matrix and matrix_in.
3607 const double* this_values = this->value();
3608 const double* in_values = matrix_in.value();
3609
3610
3611 // The first entry in row_start is always zero.
3612 res_row_start.push_back(0);
3613
3614 // Loop through the rows of both matrices and insert the column indices and
3615 // values as a key-value pair.
3616 for (unsigned row_i = 0; row_i < nrow_local; row_i++)
3617 {
3618 // Create the map for this row.
3619 std::map<int, double> res_row_map;
3620
3621 // Insert the column and value pair for this matrix.
3622 for (int i = this_row_start[row_i]; i < this_row_start[row_i + 1]; i++)
3623 {
3624 res_row_map[this_column_indices[i]] = this_values[i];
3625 }
3626
3627 // Insert the column and value pair for in matrix.
3628 for (int i = in_row_start[row_i]; i < in_row_start[row_i + 1]; i++)
3629 {
3630 res_row_map[in_column_indices[i]] += in_values[i];
3631 }
3632
3633 // Fill in the row start
3634 res_row_start.push_back(res_row_start.back() + res_row_map.size());
3635
3636 // Now insert the key into res_column_indices and value into res_values
3637 for (std::map<int, double>::iterator it = res_row_map.begin();
3638 it != res_row_map.end();
3639 ++it)
3640 {
3641 res_column_indices.push_back(it->first);
3642 res_values.push_back(it->second);
3643 }
3644 }
3645
3646 // Finally build the result_matrix.
3647 if (result_matrix.distribution_pt()->built())
3648 {
3649 // Use the existing distribution.
3650 result_matrix.build(
3651 this->ncol(), res_values, res_column_indices, res_row_start);
3652 }
3653 else
3654 {
3655 // Build with THIS distribution
3656 result_matrix.build(this->distribution_pt(),
3657 this->ncol(),
3658 res_values,
3659 res_column_indices,
3660 res_row_start);
3661 }
3662 }
3663
3664 //=================================================================
3665 /// Namespace for helper functions for CRDoubleMatrices
3666 //=================================================================
3667 namespace CRDoubleMatrixHelpers
3668 {
3669 //============================================================================
3670 /// Builds a uniformly distributed matrix.
3671 /// A locally replicated matrix is constructed then redistributed using
3672 /// OOMPH-LIB's default uniform row distribution.
3673 /// This is memory intensive thus should be used for
3674 /// testing or small problems only.
3675 //============================================================================
3677 const unsigned& nrow,
3678 const unsigned& ncol,
3679 const OomphCommunicator* const comm_pt,
3680 const Vector<double>& values,
3681 const Vector<int>& column_indices,
3682 const Vector<int>& row_start,
3683 CRDoubleMatrix& matrix_out)
3684 {
3685#ifdef PARANOID
3686 // Check if the communicator exists.
3687 if (comm_pt == 0)
3688 {
3689 std::ostringstream error_message;
3690 error_message << "Please supply the communicator.\n";
3691 throw OomphLibError(error_message.str(),
3692 OOMPH_CURRENT_FUNCTION,
3693 OOMPH_EXCEPTION_LOCATION);
3694 }
3695 // Is the out matrix built? We need an empty matrix!
3696 if (matrix_out.built())
3697 {
3698 std::ostringstream error_message;
3699 error_message << "The result matrix has been built.\n"
3700 << "Please clear the matrix.\n";
3701 throw OomphLibError(error_message.str(),
3702 OOMPH_CURRENT_FUNCTION,
3703 OOMPH_EXCEPTION_LOCATION);
3704 }
3705#endif
3706
3707 // Create the locally replicated distribution.
3708 bool distributed = false;
3709 LinearAlgebraDistribution locally_replicated_distribution(
3710 comm_pt, nrow, distributed);
3711
3712 // Create the matrix.
3713 matrix_out.build(&locally_replicated_distribution,
3714 ncol,
3715 values,
3716 column_indices,
3717 row_start);
3718
3719 // Create the distributed distribution.
3720 distributed = true;
3721 LinearAlgebraDistribution distributed_distribution(
3722 comm_pt, nrow, distributed);
3723
3724 // Redistribute the matrix.
3725 matrix_out.redistribute(&distributed_distribution);
3726 }
3727
3728 //============================================================================
3729 /// Compute infinity (maximum) norm of sub blocks as if it was one matrix
3730 //============================================================================
3732 {
3733 // The number of block rows and columns
3734 const unsigned nblockrow = matrix_pt.nrow();
3735 const unsigned nblockcol = matrix_pt.ncol();
3736
3737#ifdef PARANOID
3738 // Check that tehre is at least one matrix.
3739 if (matrix_pt.nrow() == 0)
3740 {
3741 std::ostringstream error_message;
3742 error_message << "There are no matrices... \n";
3743 throw OomphLibError(error_message.str(),
3744 OOMPH_CURRENT_FUNCTION,
3745 OOMPH_EXCEPTION_LOCATION);
3746 }
3747
3748
3749 // Check that all matrix_pt pointers are not null
3750 // and the matrices are built.
3751 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3752 {
3753 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3754 {
3755 if (matrix_pt(block_row_i, block_col_i) == 0)
3756 {
3757 std::ostringstream error_message;
3758 error_message << "The pointer martrix_pt(" << block_row_i << ","
3759 << block_col_i << ") is null.\n";
3760 throw OomphLibError(error_message.str(),
3761 OOMPH_CURRENT_FUNCTION,
3762 OOMPH_EXCEPTION_LOCATION);
3763 }
3764
3765 if (!matrix_pt(block_row_i, block_col_i)->built())
3766 {
3767 std::ostringstream error_message;
3768 error_message << "The matrix at martrix_pt(" << block_row_i << ","
3769 << block_col_i << ") is not built.\n";
3770 throw OomphLibError(error_message.str(),
3771 OOMPH_CURRENT_FUNCTION,
3772 OOMPH_EXCEPTION_LOCATION);
3773 }
3774 }
3775 }
3776#endif
3777
3778#ifdef OOMPH_HAS_MPI
3779
3780 // The communicator pointer from block (0,0)
3781 const OomphCommunicator* const comm_pt =
3782 matrix_pt(0, 0)->distribution_pt()->communicator_pt();
3783
3784#ifdef PARANOID
3785
3786
3787 // Check that all communicators are the same
3788 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3789 {
3790 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3791 {
3792 // Communicator for this block matrix.
3793 const OomphCommunicator current_block_comm =
3794 *(matrix_pt(block_row_i, block_col_i)
3795 ->distribution_pt()
3796 ->communicator_pt());
3797 if (*comm_pt != current_block_comm)
3798 {
3799 std::ostringstream error_message;
3800 error_message << "The communicator of block martrix_pt("
3801 << block_row_i << "," << block_col_i
3802 << ") is not the same as block "
3803 << "matrix_pt(0,0).\n";
3804 throw OomphLibError(error_message.str(),
3805 OOMPH_CURRENT_FUNCTION,
3806 OOMPH_EXCEPTION_LOCATION);
3807 }
3808 }
3809 }
3810
3811 // Check that all distributed boolean are the same (if on more than 1
3812 // core)
3813 if (comm_pt->nproc() > 1)
3814 {
3815 // Get the distributed boolean from matrix_pt(0,0)
3816 bool first_distributed = matrix_pt(0, 0)->distributed();
3817
3818 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3819 {
3820 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3821 {
3822 // Is the current block distributed?
3823 bool current_distributed =
3824 matrix_pt(block_row_i, block_col_i)->distributed();
3825
3826 if (first_distributed != current_distributed)
3827 {
3828 std::ostringstream error_message;
3829 error_message << "Block matrix_pt(" << block_row_i << ","
3830 << block_col_i << ") and block matrix_pt(0,0) "
3831 << "have a different distributed boolean.\n";
3832 throw OomphLibError(error_message.str(),
3833 OOMPH_CURRENT_FUNCTION,
3834 OOMPH_EXCEPTION_LOCATION);
3835 }
3836 }
3837 }
3838 }
3839
3840 // Check that all sub matrix dimensions "make sense"
3841 // We need to check that all the matrices in the same row has the same
3842 // nrow. Then repeat for the columns.
3843
3844 // Check the nrow of each block row.
3845 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3846 {
3847 // Get the nrow to compare against from the first column.
3848 const unsigned first_block_nrow = matrix_pt(block_row_i, 0)->nrow();
3849
3850 // Loop through the block columns.
3851 for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
3852 {
3853 // If the nrow of this block is not the same as the nrow from the
3854 // first block in this block row, throw an error.
3855 const unsigned current_block_nrow =
3856 matrix_pt(block_row_i, block_col_i)->nrow();
3857
3858 if (first_block_nrow != current_block_nrow)
3859 {
3860 std::ostringstream error_message;
3861 error_message << "First block has nrow = " << current_block_nrow
3862 << ". But martrix_pt(" << block_row_i << ","
3863 << block_col_i
3864 << ") has nrow = " << current_block_nrow << ".\n";
3865 throw OomphLibError(error_message.str(),
3866 OOMPH_CURRENT_FUNCTION,
3867 OOMPH_EXCEPTION_LOCATION);
3868 }
3869 }
3870 }
3871
3872 // Check the ncol of each block column.
3873 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3874 {
3875 // Get the ncol from the first block row to compare against.
3876 const unsigned first_block_ncol = matrix_pt(0, block_col_i)->ncol();
3877
3878 for (unsigned block_row_i = 1; block_row_i < nblockrow; block_row_i++)
3879 {
3880 // Get the ncol for the current block.
3881 const unsigned current_block_ncol =
3882 matrix_pt(block_row_i, block_col_i)->ncol();
3883
3884 if (first_block_ncol != current_block_ncol)
3885 {
3886 std::ostringstream error_message;
3887 error_message << "First block has ncol = " << current_block_ncol
3888 << ". But martrix_pt(" << block_row_i << ","
3889 << block_col_i
3890 << ") has ncol = " << current_block_ncol << ".\n";
3891 throw OomphLibError(error_message.str(),
3892 OOMPH_CURRENT_FUNCTION,
3893 OOMPH_EXCEPTION_LOCATION);
3894 }
3895 }
3896 }
3897
3898 // Check that the distribution for each block row is the same.
3899 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3900 {
3901 // The first distribution of this block row.
3902 const LinearAlgebraDistribution first_dist =
3903 *(matrix_pt(block_row_i, 0)->distribution_pt());
3904
3905 // Loop through the rest of the block columns.
3906 for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
3907 {
3908 // Get the distribution from the current block.
3909 const LinearAlgebraDistribution current_dist =
3910 matrix_pt(block_row_i, block_col_i)->distribution_pt();
3911
3912 // Compare the first distribution against the current.
3913 if (first_dist != current_dist)
3914 {
3915 std::ostringstream error_message;
3916 error_message << "First distribution of block row " << block_row_i
3917 << " is different from the distribution from "
3918 << "martrix_pt(" << block_row_i << "," << block_col_i
3919 << ").\n";
3920 throw OomphLibError(error_message.str(),
3921 OOMPH_CURRENT_FUNCTION,
3922 OOMPH_EXCEPTION_LOCATION);
3923 }
3924 }
3925 }
3926#endif
3927
3928#endif
3929
3930 // Loop thrpugh the block rows, then block columns to
3931 // compute the local inf norm
3932 double inf_norm = 0;
3933 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3934 {
3935 // Get the number of local rows from the first block.
3936 unsigned block_nrow_local = matrix_pt(block_row_i, 0)->nrow_local();
3937
3938 // Loop through the block_nrow_local in this block row
3939 for (unsigned local_row_i = 0; local_row_i < block_nrow_local;
3940 local_row_i++)
3941 {
3942 double abs_sum_of_row = 0;
3943 // Loop through the block columns
3944 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3945 {
3946 // Locally cache the pointer to the current block.
3947 CRDoubleMatrix* block_pt = matrix_pt(block_row_i, block_col_i);
3948
3949 const int* row_start = block_pt->row_start();
3950 const double* value = block_pt->value();
3951
3952 // Loop through the values
3953 for (int val_i = row_start[local_row_i];
3954 val_i < row_start[local_row_i + 1];
3955 val_i++)
3956 {
3957 abs_sum_of_row += fabs(value[val_i]);
3958 }
3959 }
3960 // Store the max row
3961 inf_norm = std::max(inf_norm, abs_sum_of_row);
3962 }
3963 }
3964
3965 // if this vector is distributed and on multiple processors then gather
3966#ifdef OOMPH_HAS_MPI
3967 double inf_norm_local = inf_norm;
3968 if (matrix_pt(0, 0)->distributed() && comm_pt->nproc() > 1)
3969 {
3970 MPI_Allreduce(&inf_norm,
3971 &inf_norm_local,
3972 1,
3973 MPI_DOUBLE,
3974 MPI_MAX,
3975 comm_pt->mpi_comm());
3976 }
3977 inf_norm = inf_norm_local;
3978#endif
3979
3980 // and return
3981 return inf_norm;
3982 }
3983
3984 //============================================================================
3985 /// Calculates the largest Gershgorin disc whilst preserving the sign. Let
3986 /// A be an n by n matrix, with entries \f$ a_{ij} \f$. For \f$ i \in \{ 1,...,n \} \f$
3987 /// let \f$ R_i = \sum_{j\neq i} |a_{ij}| \f$ be the sum of the absolute
3988 /// values of the non-diagonal entries in the i-th row. Let \f$ D(a_{ii},R_i) \f$
3989 /// be the closed disc centered at \f$ a_{ii} \f$ with radius \f$ R_i \f$,
3990 /// such a disc is called a Gershgorin disc.
3991 ///
3992 /// \n
3993 ///
3994 /// We calculate \f$ |D(a_{ii},R_i)|_{max} \f$ and multiply by the sign of
3995 /// the diagonal entry.
3996 ///
3997 /// \n
3998 ///
3999 /// The DenseMatrix of CRDoubleMatrices is treated as if it were one
4000 /// large matrix. Therefore the dimensions of the sub matrices have to
4001 /// "make sense"; there is a paranoid check for this.
4002 //============================================================================
4004 const DenseMatrix<CRDoubleMatrix*>& matrix_pt)
4005 {
4006 // The number of block rows and columns
4007 const unsigned nblockrow = matrix_pt.nrow();
4008 const unsigned nblockcol = matrix_pt.ncol();
4009
4010#ifdef PARANOID
4011 // Check that tehre is at least one matrix.
4012 if (matrix_pt.nrow() == 0)
4013 {
4014 std::ostringstream error_message;
4015 error_message << "There are no matrices... \n";
4016 throw OomphLibError(error_message.str(),
4017 OOMPH_CURRENT_FUNCTION,
4018 OOMPH_EXCEPTION_LOCATION);
4019 }
4020
4021
4022 // Check that all matrix_pt pointers are not null
4023 // and the matrices are built.
4024 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4025 {
4026 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4027 {
4028 if (matrix_pt(block_row_i, block_col_i) == 0)
4029 {
4030 std::ostringstream error_message;
4031 error_message << "The pointer martrix_pt(" << block_row_i << ","
4032 << block_col_i << ") is null.\n";
4033 throw OomphLibError(error_message.str(),
4034 OOMPH_CURRENT_FUNCTION,
4035 OOMPH_EXCEPTION_LOCATION);
4036 }
4037
4038 if (!matrix_pt(block_row_i, block_col_i)->built())
4039 {
4040 std::ostringstream error_message;
4041 error_message << "The matrix at martrix_pt(" << block_row_i << ","
4042 << block_col_i << ") is not built.\n";
4043 throw OomphLibError(error_message.str(),
4044 OOMPH_CURRENT_FUNCTION,
4045 OOMPH_EXCEPTION_LOCATION);
4046 }
4047 }
4048 }
4049#endif
4050
4051
4052#ifdef OOMPH_HAS_MPI
4053
4054 // The communicator pointer from block (0,0)
4055 // All communicators should be the same, we check this next.
4056 const OomphCommunicator* const comm_pt =
4057 matrix_pt(0, 0)->distribution_pt()->communicator_pt();
4058
4059#ifdef PARANOID
4060
4061 // Check that all communicators are the same
4062 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4063 {
4064 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4065 {
4066 // Communicator for this block matrix.
4067 const OomphCommunicator current_block_comm =
4068 *(matrix_pt(block_row_i, block_col_i)
4069 ->distribution_pt()
4070 ->communicator_pt());
4071 if (*comm_pt != current_block_comm)
4072 {
4073 std::ostringstream error_message;
4074 error_message << "The communicator of block martrix_pt("
4075 << block_row_i << "," << block_col_i
4076 << ") is not the same as block "
4077 << "matrix_pt(0,0).\n";
4078 throw OomphLibError(error_message.str(),
4079 OOMPH_CURRENT_FUNCTION,
4080 OOMPH_EXCEPTION_LOCATION);
4081 }
4082 }
4083 }
4084
4085 // Check that all distributed boolean are the same (if on more than 1
4086 // core)
4087 if (comm_pt->nproc() > 1)
4088 {
4089 // Get the distributed boolean from matrix_pt(0,0)
4090 bool first_distributed = matrix_pt(0, 0)->distributed();
4091
4092 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4093 {
4094 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4095 {
4096 // Is the current block distributed?
4097 bool current_distributed =
4098 matrix_pt(block_row_i, block_col_i)->distributed();
4099
4100 if (first_distributed != current_distributed)
4101 {
4102 std::ostringstream error_message;
4103 error_message << "Block matrix_pt(" << block_row_i << ","
4104 << block_col_i << ") and block matrix_pt(0,0) "
4105 << "have a different distributed boolean.\n";
4106 throw OomphLibError(error_message.str(),
4107 OOMPH_CURRENT_FUNCTION,
4108 OOMPH_EXCEPTION_LOCATION);
4109 }
4110 }
4111 }
4112 }
4113
4114 // Check that all sub matrix dimensions "make sense"
4115 // We need to check that all the matrices in the same row has the same
4116 // nrow. Then repeat for the columns.
4117
4118 // Check the nrow of each block row.
4119 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4120 {
4121 // Get the nrow to compare against from the first column.
4122 const unsigned first_block_nrow = matrix_pt(block_row_i, 0)->nrow();
4123
4124 // Loop through the block columns.
4125 for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
4126 {
4127 // If the nrow of this block is not the same as the nrow from the
4128 // first block in this block row, throw an error.
4129 const unsigned current_block_nrow =
4130 matrix_pt(block_row_i, block_col_i)->nrow();
4131
4132 if (first_block_nrow != current_block_nrow)
4133 {
4134 std::ostringstream error_message;
4135 error_message << "First block has nrow = " << current_block_nrow
4136 << ". But martrix_pt(" << block_row_i << ","
4137 << block_col_i
4138 << ") has nrow = " << current_block_nrow << ".\n";
4139 throw OomphLibError(error_message.str(),
4140 OOMPH_CURRENT_FUNCTION,
4141 OOMPH_EXCEPTION_LOCATION);
4142 }
4143 }
4144 }
4145
4146 // Check the ncol of each block column.
4147 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4148 {
4149 // Get the ncol from the first block row to compare against.
4150 const unsigned first_block_ncol = matrix_pt(0, block_col_i)->ncol();
4151
4152 for (unsigned block_row_i = 1; block_row_i < nblockrow; block_row_i++)
4153 {
4154 // Get the ncol for the current block.
4155 const unsigned current_block_ncol =
4156 matrix_pt(block_row_i, block_col_i)->ncol();
4157
4158 if (first_block_ncol != current_block_ncol)
4159 {
4160 std::ostringstream error_message;
4161 error_message << "First block has ncol = " << current_block_ncol
4162 << ". But martrix_pt(" << block_row_i << ","
4163 << block_col_i
4164 << ") has ncol = " << current_block_ncol << ".\n";
4165 throw OomphLibError(error_message.str(),
4166 OOMPH_CURRENT_FUNCTION,
4167 OOMPH_EXCEPTION_LOCATION);
4168 }
4169 }
4170 }
4171
4172 // Check that the distribution for each block row is the same.
4173 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4174 {
4175 // The first distribution of this block row.
4176 const LinearAlgebraDistribution first_dist =
4177 *(matrix_pt(block_row_i, 0)->distribution_pt());
4178
4179 // Loop through the rest of the block columns.
4180 for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
4181 {
4182 // Get the distribution from the current block.
4183 const LinearAlgebraDistribution current_dist =
4184 matrix_pt(block_row_i, block_col_i)->distribution_pt();
4185
4186 // Compare the first distribution against the current.
4187 if (first_dist != current_dist)
4188 {
4189 std::ostringstream error_message;
4190 error_message << "First distribution of block row " << block_row_i
4191 << " is different from the distribution from "
4192 << "martrix_pt(" << block_row_i << "," << block_col_i
4193 << ").\n";
4194 throw OomphLibError(error_message.str(),
4195 OOMPH_CURRENT_FUNCTION,
4196 OOMPH_EXCEPTION_LOCATION);
4197 }
4198 }
4199 }
4200
4201#endif
4202#endif
4203
4204 // Loop thrpugh the block rows, then block columns to
4205 // compute the local inf norm
4206 double extreme_disc = 0;
4207 for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4208 {
4209 // Get the number of local rows from the first block.
4210 unsigned block_nrow_local = matrix_pt(block_row_i, 0)->nrow_local();
4211
4212 // Loop through the block_nrow_local in this block row
4213 for (unsigned local_row_i = 0; local_row_i < block_nrow_local;
4214 local_row_i++)
4215 {
4216 double abs_sum_of_row = 0;
4217 // Loop through the block columns
4218 for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4219 {
4220 // Locally cache the pointer to the current block.
4221 CRDoubleMatrix* block_pt = matrix_pt(block_row_i, block_col_i);
4222
4223 const int* row_start = block_pt->row_start();
4224 const double* value = block_pt->value();
4225
4226 // Loop through the values
4227 for (int val_i = row_start[local_row_i];
4228 val_i < row_start[local_row_i + 1];
4229 val_i++)
4230 {
4231 abs_sum_of_row += fabs(value[val_i]);
4232 }
4233 }
4234
4235 // Now minus the diagonal entry...
4236 // Locate the diagonal block matrix.
4237 double* s_values = matrix_pt(block_row_i, block_row_i)->value();
4238 int* s_column_index =
4239 matrix_pt(block_row_i, block_row_i)->column_index();
4240 int* s_row_start = matrix_pt(block_row_i, block_row_i)->row_start();
4241 // int s_nrow_local =
4242 // matrix_pt(block_row_i,block_row_i)->nrow_local();
4243 int s_first_row = matrix_pt(block_row_i, block_row_i)->first_row();
4244
4245 // Get the diagonal value...
4246 double diagonal_value = 0;
4247 bool found = false;
4248 for (int j = s_row_start[local_row_i];
4249 j < s_row_start[local_row_i + 1] && !found;
4250 j++)
4251 {
4252 if (s_column_index[j] == int(local_row_i + s_first_row))
4253 {
4254 diagonal_value = s_values[j];
4255 found = true;
4256 }
4257 }
4258
4259 // Check if the diagonal entry is found.
4260 if (!found)
4261 {
4262 std::ostringstream error_message;
4263 error_message << "The diagonal entry for the block(" << block_row_i
4264 << "," << block_row_i << ")\n"
4265 << "on local row " << local_row_i
4266 << " does not exist." << std::endl;
4267 throw OomphLibError(error_message.str(),
4268 OOMPH_CURRENT_FUNCTION,
4269 OOMPH_EXCEPTION_LOCATION);
4270 }
4271
4272 // This is the disc.
4273 abs_sum_of_row -= fabs(diagonal_value);
4274
4275 // Now we have to check if the diagonal entry is
4276 // on the left or right side of zero.
4277 if (diagonal_value > 0)
4278 {
4279 double extreme_disc_local = diagonal_value + abs_sum_of_row;
4280 extreme_disc = std::max(extreme_disc_local, extreme_disc);
4281 }
4282 else
4283 {
4284 double extreme_disc_local = diagonal_value - abs_sum_of_row;
4285 extreme_disc = std::min(extreme_disc_local, extreme_disc);
4286 }
4287 } // Loop through local row (of all block column)
4288 } // Loop through block row
4289
4290 // if this vector is distributed and on multiple processors then gather
4291#ifdef OOMPH_HAS_MPI
4292 double extreme_disc_local = extreme_disc;
4293 if (matrix_pt(0, 0)->distributed() && comm_pt->nproc() > 1)
4294 {
4295 if (extreme_disc > 0)
4296 {
4297 MPI_Allreduce(&extreme_disc,
4298 &extreme_disc_local,
4299 1,
4300 MPI_DOUBLE,
4301 MPI_MAX,
4302 comm_pt->mpi_comm());
4303 }
4304 else
4305 {
4306 MPI_Allreduce(&extreme_disc,
4307 &extreme_disc_local,
4308 1,
4309 MPI_DOUBLE,
4310 MPI_MIN,
4311 comm_pt->mpi_comm());
4312 }
4313 }
4314 extreme_disc = extreme_disc_local;
4315#endif
4316
4317 // and return
4318 return extreme_disc;
4319 }
4320
4321 //============================================================================
4322 /// Concatenate CRDoubleMatrix matrices.
4323 /// The in matrices are concatenated such that the block structure of the
4324 /// in matrices are preserved in the result matrix. Communication between
4325 /// processors is required. If the block structure of the sub matrices does
4326 /// not need to be preserved, consider using
4327 /// CRDoubleMatrixHelpers::concatenate_without_communication(...).
4328 ///
4329 /// The matrix manipulation functions
4330 /// CRDoubleMatrixHelpers::concatenate(...) and
4331 /// CRDoubleMatrixHelpers::concatenate_without_communication(...)
4332 /// are analogous to the Vector manipulation functions
4333 /// DoubleVectorHelpers::concatenate(...) and
4334 /// DoubleVectorHelpers::concatenate_without_communication(...).
4335 /// Please look at the DoubleVector functions for an illustration of the
4336 /// differences between concatenate(...) and
4337 /// concatenate_without_communication(...).
4338 ///
4339 /// Distribution of the result matrix:
4340 /// If the result matrix does not have a distribution built, then it will be
4341 /// given a uniform row distribution. Otherwise we use the existing
4342 /// distribution. This gives the user the ability to define their own
4343 /// distribution, or save computing power if a distribution has
4344 /// been pre-built.
4345 ///
4346 /// NOTE: ALL the matrices pointed to by matrix_pt have to be built. This is
4347 /// not the case with concatenate_without_communication(...)
4348 //============================================================================
4350 CRDoubleMatrix& result_matrix)
4351 {
4352 // The number of block rows and block columns.
4353 unsigned matrix_nrow = matrix_pt.nrow();
4354 unsigned matrix_ncol = matrix_pt.ncol();
4355
4356 // PARANOID checks involving only the in matrices.
4357#ifdef PARANOID
4358 // Are there matrices to concatenate?
4359 if (matrix_nrow == 0)
4360 {
4361 std::ostringstream error_message;
4362 error_message << "There are no matrices to concatenate.\n";
4363 throw OomphLibError(error_message.str(),
4364 OOMPH_CURRENT_FUNCTION,
4365 OOMPH_EXCEPTION_LOCATION);
4366 }
4367
4368 // Does this matrix need concatenating?
4369 if ((matrix_nrow == 1) && (matrix_ncol == 1))
4370 {
4371 std::ostringstream warning_message;
4372 warning_message << "There is only one matrix to concatenate...\n"
4373 << "This does not require concatenating...\n";
4374 OomphLibWarning(warning_message.str(),
4375 OOMPH_CURRENT_FUNCTION,
4376 OOMPH_EXCEPTION_LOCATION);
4377 }
4378
4379 // Are all sub matrices built?
4380 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4381 {
4382 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4383 {
4384 if (!(matrix_pt(block_row_i, block_col_i)->built()))
4385 {
4386 std::ostringstream error_message;
4387 error_message << "The sub matrix (" << block_row_i << ","
4388 << block_col_i << ")\n"
4389 << "is not built. \n";
4390 throw OomphLibError(error_message.str(),
4391 OOMPH_CURRENT_FUNCTION,
4392 OOMPH_EXCEPTION_LOCATION);
4393 }
4394 }
4395 }
4396
4397 // Do all dimensions of sub matrices "make sense"?
4398 // Compare the number of rows of each block matrix in a block row.
4399 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4400 {
4401 // Use the first column to compare against the rest.
4402 unsigned long current_block_nrow = matrix_pt(block_row_i, 0)->nrow();
4403
4404 // Compare against columns 1 to matrix_ncol - 1
4405 for (unsigned block_col_i = 1; block_col_i < matrix_ncol; block_col_i++)
4406 {
4407 // Get the nrow for this sub block.
4408 unsigned long subblock_nrow =
4409 matrix_pt(block_row_i, block_col_i)->nrow();
4410
4411 if (current_block_nrow != subblock_nrow)
4412 {
4413 std::ostringstream error_message;
4414 error_message << "The sub matrix (" << block_row_i << ","
4415 << block_col_i << ")\n"
4416 << "requires nrow = " << current_block_nrow
4417 << ", but has nrow = " << subblock_nrow << ".\n";
4418 throw OomphLibError(error_message.str(),
4419 OOMPH_CURRENT_FUNCTION,
4420 OOMPH_EXCEPTION_LOCATION);
4421 }
4422 }
4423 }
4424
4425 // Compare the number of columns of each block matrix in a block column.
4426 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4427 {
4428 // Use the first row to compare against the rest.
4429 unsigned long current_block_ncol = matrix_pt(0, block_col_i)->ncol();
4430
4431 // Compare against rows 1 to matrix_nrow - 1
4432 for (unsigned block_row_i = 1; block_row_i < matrix_nrow; block_row_i++)
4433 {
4434 // Get the ncol for this sub block.
4435 unsigned long subblock_ncol =
4436 matrix_pt(block_row_i, block_col_i)->ncol();
4437
4438 if (current_block_ncol != subblock_ncol)
4439 {
4440 std::ostringstream error_message;
4441 error_message << "The sub matrix (" << block_row_i << ","
4442 << block_col_i << ")\n"
4443 << "requires ncol = " << current_block_ncol
4444 << ", but has ncol = " << subblock_ncol << ".\n";
4445 throw OomphLibError(error_message.str(),
4446 OOMPH_CURRENT_FUNCTION,
4447 OOMPH_EXCEPTION_LOCATION);
4448 }
4449 }
4450 }
4451#endif
4452
4453 // The communicator pointer from block (0,0)
4454 const OomphCommunicator* const comm_pt =
4455 matrix_pt(0, 0)->distribution_pt()->communicator_pt();
4456
4457 // Check if the block (0,0) is distributed or not.
4458 bool distributed = matrix_pt(0, 0)->distributed();
4459
4460 // If the result matrix does not have a distribution, we create a uniform
4461 // distribution.
4462 if (!result_matrix.distribution_pt()->built())
4463 {
4464 // Sum of sub matrix nrow. We use the first column.
4465 unsigned tmp_nrow = 0;
4466 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4467 {
4468 tmp_nrow += matrix_pt(block_row_i, 0)->nrow();
4469 }
4470
4471 LinearAlgebraDistribution tmp_distribution(
4472 comm_pt, tmp_nrow, distributed);
4473
4474 result_matrix.build(&tmp_distribution);
4475 }
4476 else
4477 // A distribution is supplied for the result matrix.
4478 {
4479#ifdef PARANOID
4480 // Check if the sum of the nrow from the sub matrices is the same as the
4481 // the nrow from the result matrix.
4482
4483 // Sum of sub matrix nrow. We use the first column.
4484 unsigned tmp_nrow = 0;
4485 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4486 {
4487 tmp_nrow += matrix_pt(block_row_i, 0)->nrow();
4488 }
4489
4490 if (tmp_nrow != result_matrix.nrow())
4491 {
4492 std::ostringstream error_message;
4493 error_message << "The total number of rows from the matrices to\n"
4494 << "concatenate does not match the nrow from the\n"
4495 << "result matrix\n";
4496 throw OomphLibError(error_message.str(),
4497 OOMPH_CURRENT_FUNCTION,
4498 OOMPH_EXCEPTION_LOCATION);
4499 }
4500#endif
4501 }
4502
4503#ifdef PARANOID
4504
4505 // Are all the communicators the same?
4506 // Compare the communicator for sub matrices (against the result matrix).
4507 {
4508 const OomphCommunicator communicator =
4509 *(result_matrix.distribution_pt()->communicator_pt());
4510
4511 // Are all communicator pointers the same?
4512 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4513 {
4514 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
4515 block_col_i++)
4516 {
4517 const OomphCommunicator another_communicator =
4518 *(matrix_pt(block_row_i, block_col_i)
4519 ->distribution_pt()
4520 ->communicator_pt());
4521
4522 if (!(communicator == another_communicator))
4523 {
4524 std::ostringstream error_message;
4525 error_message << "The OomphCommunicator of the sub matrix ("
4526 << block_row_i << "," << block_col_i << ")\n"
4527 << "does not have the same communicator as the "
4528 "result matrix. \n";
4529 throw OomphLibError(error_message.str(),
4530 OOMPH_CURRENT_FUNCTION,
4531 OOMPH_EXCEPTION_LOCATION);
4532 }
4533 }
4534 }
4535 }
4536
4537 // Are all the distributed boolean the same? This only applies if we have
4538 // more than one processor. If there is only one processor, then it does
4539 // not matter if it is distributed or not - they are conceptually the
4540 // same.
4541 if (comm_pt->nproc() != 1)
4542 {
4543 // Compare distributed for sub matrices (against the result matrix).
4544 const bool res_distributed = result_matrix.distributed();
4545
4546 // Loop over all sub blocks.
4547 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4548 {
4549 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
4550 block_col_i++)
4551 {
4552 const bool another_distributed =
4553 matrix_pt(block_row_i, block_col_i)->distributed();
4554
4555 if (res_distributed != another_distributed)
4556 {
4557 std::ostringstream error_message;
4558 error_message << "The distributed boolean of the sub matrix ("
4559 << block_row_i << "," << block_col_i << ")\n"
4560 << "is not the same as the result matrix. \n";
4561 throw OomphLibError(error_message.str(),
4562 OOMPH_CURRENT_FUNCTION,
4563 OOMPH_EXCEPTION_LOCATION);
4564 }
4565 }
4566 }
4567 }
4568#endif
4569
4570
4571 // Get the number of columns up to each block for offset
4572 // in calculating the result column indices.
4573 // Since the number of columns in each block column is the same,
4574 // we only loop through the first block row (row zero).
4575 Vector<unsigned long> sum_of_ncol_up_to_block(matrix_ncol);
4576
4577 // Also compute the total number of columns to build the resulting matrix.
4578 unsigned long res_ncol = 0;
4579
4580 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4581 {
4582 sum_of_ncol_up_to_block[block_col_i] = res_ncol;
4583 res_ncol += matrix_pt(0, block_col_i)->ncol();
4584 }
4585
4586 // We begin the process of extracting and ordering the values,
4587 // column_indices and row_start of all the sub blocks.
4588 if ((comm_pt->nproc() == 1) || !distributed)
4589 // Serial version of the code.
4590 {
4591 // Get the total number of non zero entries so we can reserve storage
4592 // for the values and column_indices vectors.
4593 unsigned long res_nnz = 0;
4594 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4595 {
4596 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
4597 block_col_i++)
4598 {
4599 res_nnz += matrix_pt(block_row_i, block_col_i)->nnz();
4600 }
4601 }
4602
4603 // Declare the vectors required to build a CRDoubleMatrix
4604 Vector<double> res_values;
4605 Vector<int> res_column_indices;
4606 Vector<int> res_row_start;
4607
4608 // Reserve space for the vectors.
4609 res_values.reserve(res_nnz);
4610 res_column_indices.reserve(res_nnz);
4611 res_row_start.reserve(result_matrix.nrow() + 1);
4612
4613 // Now we fill in the data.
4614
4615 // Running sum of nnz per row.
4616 int nnz_running_sum = 0;
4617
4618 // Loop through the block rows.
4619 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4620 {
4621 // Get the number of rows in this block row, from the first block.
4622 unsigned long block_row_nrow = matrix_pt(block_row_i, 0)->nrow();
4623
4624 // Loop through the number of rows in this block row
4625 for (unsigned row_i = 0; row_i < block_row_nrow; row_i++)
4626 {
4627 // The row start is the nnz at the start of the row.
4628 res_row_start.push_back(nnz_running_sum);
4629
4630 // Loop through the block columns
4631 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
4632 block_col_i++)
4633 {
4634 // Get the current block.
4635 CRDoubleMatrix* current_block_pt =
4636 matrix_pt(block_row_i, block_col_i);
4637
4638 // Get the values, column_indices and row_start for this block.
4639 double* current_block_values = current_block_pt->value();
4640 int* current_block_column_indices =
4641 current_block_pt->column_index();
4642 int* current_block_row_start = current_block_pt->row_start();
4643
4644 for (int val_i = current_block_row_start[row_i];
4645 val_i < current_block_row_start[row_i + 1];
4646 val_i++)
4647 {
4648 res_values.push_back(current_block_values[val_i]);
4649 res_column_indices.push_back(
4650 current_block_column_indices[val_i] +
4651 sum_of_ncol_up_to_block[block_col_i]);
4652 }
4653
4654 // Update the running sum of nnz per row
4655 nnz_running_sum += current_block_row_start[row_i + 1] -
4656 current_block_row_start[row_i];
4657 } // for block cols
4658 } // for rows
4659 } // for block rows
4660
4661 // Fill in the last row start
4662 res_row_start.push_back(res_nnz);
4663
4664 // Build the matrix
4665 result_matrix.build(
4666 res_ncol, res_values, res_column_indices, res_row_start);
4667 }
4668 // Otherwise we are dealing with a distributed matrix.
4669 else
4670 {
4671#ifdef OOMPH_HAS_MPI
4672
4673 // Flag to enable timing. This is for debugging
4674 // and/or testing purposes only.
4675 bool enable_timing = false;
4676
4677 // Get the number of processors
4678 unsigned nproc = comm_pt->nproc();
4679
4680 // My rank
4681 unsigned my_rank = comm_pt->my_rank();
4682
4683 // Storage for the data (per processor) to send.
4684 Vector<Vector<unsigned>> column_indices_to_send(nproc);
4685 Vector<Vector<double>> values_to_send(nproc);
4686
4687 // The sum of the nrow for the sub blocks (so far). This is used as an
4688 // offset to calculate the global equation number in the result matrix.
4689 unsigned long sum_of_block_nrow = 0;
4690
4691 double t_prep_data_start;
4692 if (enable_timing)
4693 {
4694 t_prep_data_start = TimingHelpers::timer();
4695 }
4696
4697 // Get the pointer to the result distribution, for convenience...
4698 LinearAlgebraDistribution* res_distribution_pt =
4699 result_matrix.distribution_pt();
4700
4701 // loop over the sub blocks to calculate the global_eqn, get the values
4702 // and column indices.
4703 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4704 {
4705 // Get the number of local rows in this block_row from the first
4706 // block.
4707 unsigned current_block_nrow_local =
4708 matrix_pt(block_row_i, 0)->nrow_local();
4709
4710 // Get the first_row for this block_row
4711 unsigned current_block_row_first_row =
4712 matrix_pt(block_row_i, 0)->first_row();
4713
4714 // Loop through the number of local rows
4715 for (unsigned sub_local_eqn = 0;
4716 sub_local_eqn < current_block_nrow_local;
4717 sub_local_eqn++)
4718 {
4719 // Calculate the corresponding (res_global_eqn) equation number
4720 // for this local row number in this block.
4721 unsigned long res_global_eqn =
4722 sub_local_eqn + current_block_row_first_row + sum_of_block_nrow;
4723
4724 // Get the processor that this global row belongs to.
4725 // The rank_of_global_row(...) function loops through all the
4726 // processors and does two unsigned comparisons. Since we have to do
4727 // this for every row, it may be better to store a list mapping for
4728 // very large number of processors.
4729 unsigned res_p =
4730 res_distribution_pt->rank_of_global_row(res_global_eqn);
4731
4732 // With the res_p, we get the res_first_row to
4733 // work out the res_local_eqn
4734 unsigned res_first_row = res_distribution_pt->first_row(res_p);
4735 unsigned res_local_eqn = res_global_eqn - res_first_row;
4736
4737 // Loop through the block columns, calculate the nnz. This is used
4738 // to reserve space for the value and column_indices Vectors.
4739 unsigned long current_row_nnz = 0;
4740 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
4741 block_col_i++)
4742 {
4743 // Get the row_start
4744 int* current_block_row_start =
4745 matrix_pt(block_row_i, block_col_i)->row_start();
4746
4747 // Update the nnz for this row.
4748 current_row_nnz += current_block_row_start[sub_local_eqn + 1] -
4749 current_block_row_start[sub_local_eqn];
4750 } // for block column, get nnz.
4751
4752 // Reserve space for efficiency.
4753 // unsigned capacity_in_res_p_vec
4754 // = column_indices_to_send[res_p].capacity();
4755
4756 // Reserve memory for nnz+2, since we need to store the
4757 // res_local_eqn and nnz as well as the data (values/column
4758 // indices). Note: The two reserve functions are called per row. If
4759 // the matrix is very sparse (just a few elements per row), it will
4760 // be more efficient to not reserve and let the STL vector handle
4761 // this. On average, this implementation is more efficient.
4762 // column_indices_to_send[res_p].reserve(capacity_in_res_p_vec
4763 // + current_row_nnz+2);
4764 // values_to_send[res_p].reserve(capacity_in_res_p_vec
4765 // + current_row_nnz+2);
4766
4767 // Push back the res_local_eqn and nnz
4768 column_indices_to_send[res_p].push_back(res_local_eqn);
4769 column_indices_to_send[res_p].push_back(current_row_nnz);
4770 values_to_send[res_p].push_back(res_local_eqn);
4771 values_to_send[res_p].push_back(current_row_nnz);
4772
4773 // Loop through the block columns again and get the values
4774 // and column_indices
4775 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
4776 block_col_i++)
4777 {
4778 // Cache the pointer to the current block for convenience.
4779 CRDoubleMatrix* current_block_pt =
4780 matrix_pt(block_row_i, block_col_i);
4781
4782 // Values, column indices and row_start for the current block.
4783 double* current_block_values = current_block_pt->value();
4784 int* current_block_column_indices =
4785 current_block_pt->column_index();
4786 int* current_block_row_start = current_block_pt->row_start();
4787
4788 // Loop though the values and column_indices
4789 for (int val_i = current_block_row_start[sub_local_eqn];
4790 val_i < current_block_row_start[sub_local_eqn + 1];
4791 val_i++)
4792 {
4793 // Push back the value.
4794 values_to_send[res_p].push_back(current_block_values[val_i]);
4795
4796 // Push back the (offset) column index.
4797 column_indices_to_send[res_p].push_back(
4798 current_block_column_indices[val_i] +
4799 sum_of_ncol_up_to_block[block_col_i]);
4800 } // for block columns
4801 } // for block column, get values and column_indices.
4802 } // for sub_local_eqn
4803
4804 // update the sum_of_block_nrow
4805 sum_of_block_nrow += matrix_pt(block_row_i, 0)->nrow();
4806
4807 } // for block_row
4808
4809 if (enable_timing)
4810 {
4811 double t_prep_data_finish = TimingHelpers::timer();
4812 double t_prep_data_time = t_prep_data_finish - t_prep_data_start;
4813 oomph_info << "Time for prep data: " << t_prep_data_time << std::endl;
4814 }
4815
4816 // Prepare to send data!
4817
4818 // Storage for the number of data to be sent to each processor.
4819 Vector<int> send_n(nproc, 0);
4820
4821 // Storage for all the values/column indices to be sent
4822 // to each processor.
4823 Vector<double> send_values_data;
4824 Vector<unsigned> send_column_indices_data;
4825
4826 // Storage location within send_values_data
4827 // (and send_column_indices_data) for data to be sent to each processor.
4828 Vector<int> send_displacement(nproc, 0);
4829
4830 double t_total_ndata_start;
4831 if (enable_timing) t_total_ndata_start = TimingHelpers::timer();
4832
4833 // Get the total amount of data which needs to be sent, so we can
4834 // reserve space for it.
4835 unsigned total_ndata = 0;
4836 for (unsigned rank = 0; rank < nproc; rank++)
4837 {
4838 if (rank != my_rank)
4839 {
4840 total_ndata += values_to_send[rank].size();
4841 }
4842 }
4843
4844 if (enable_timing)
4845 {
4846 double t_total_ndata_finish = TimingHelpers::timer();
4847 double t_total_ndata_time =
4848 t_total_ndata_finish - t_total_ndata_start;
4849 oomph_info << "Time for total_ndata: " << t_total_ndata_time
4850 << std::endl;
4851 }
4852
4853 double t_flat_pack_start;
4854 if (enable_timing) t_flat_pack_start = TimingHelpers::timer();
4855
4856 // Now we don't have to re-allocate data/memory when push_back is
4857 // called. Nb. Using push_back without reserving memory may cause
4858 // multiple re-allocation behind the scenes, this is expensive.
4859 send_values_data.reserve(total_ndata);
4860 send_column_indices_data.reserve(total_ndata);
4861
4862 // Loop over all the processors to "flat pack" the data for sending.
4863 for (unsigned rank = 0; rank < nproc; rank++)
4864 {
4865 // Set the offset for the current processor
4866 // This only has to be done once for both values and column indices.
4867 send_displacement[rank] = send_values_data.size();
4868
4869 // Don't bother to do anything if
4870 // the processor in the loop is the current processor.
4871 if (rank != my_rank)
4872 {
4873 // Put the values into the send data vector.
4874 // n_data is the same for both values and column indices.
4875 unsigned n_data = values_to_send[rank].size();
4876 for (unsigned j = 0; j < n_data; j++)
4877 {
4878 send_values_data.push_back(values_to_send[rank][j]);
4879 send_column_indices_data.push_back(
4880 column_indices_to_send[rank][j]);
4881 } // for
4882 } // if rank != my_rank
4883
4884 // Find the number of data to be added to the vector.
4885 // send_n is the same for both values and column indices.
4886 send_n[rank] = send_values_data.size() - send_displacement[rank];
4887 } // loop over processors
4888
4889 if (enable_timing)
4890 {
4891 double t_flat_pack_finish = TimingHelpers::timer();
4892 double t_flat_pack_time = t_flat_pack_finish - t_flat_pack_start;
4893 oomph_info << "t_flat_pack_time: " << t_flat_pack_time << std::endl;
4894 }
4895
4896 double t_sendn_start;
4897 if (enable_timing) t_sendn_start = TimingHelpers::timer();
4898
4899      // Storage for the number of data to be received from each processor
4900 Vector<int> receive_n(nproc, 0);
4901
4902 MPI_Alltoall(&send_n[0],
4903 1,
4904 MPI_INT,
4905 &receive_n[0],
4906 1,
4907 MPI_INT,
4908 comm_pt->mpi_comm());
4909
4910 if (enable_timing)
4911 {
4912 double t_sendn_finish = TimingHelpers::timer();
4913 double t_sendn_time = t_sendn_finish - t_sendn_start;
4914 oomph_info << "t_sendn_time: " << t_sendn_time << std::endl;
4915 }
4916
4917
4918 // Prepare the data to be received
4919 // by working out the displacement from the received data
4920 // receive_displacement is the same for both values and column indices.
4921 Vector<int> receive_displacement(nproc, 0);
4922 int receive_data_count = 0;
4923 for (unsigned rank = 0; rank < nproc; rank++)
4924 {
4925 receive_displacement[rank] = receive_data_count;
4926 receive_data_count += receive_n[rank];
4927 }
4928
4929 // Now resize the receive buffer for all data from all processors.
4930 // Make sure that it has a size of at least one.
4931 if (receive_data_count == 0)
4932 {
4933 receive_data_count++;
4934 }
4935 Vector<double> receive_values_data(receive_data_count);
4936 Vector<unsigned> receive_column_indices_data(receive_data_count);
4937
4938 // Make sure that the send buffer has size at least one
4939 // so that we don't get a segmentation fault.
4940 if (send_values_data.size() == 0)
4941 {
4942 send_values_data.resize(1);
4943 }
4944
4945 double t_send_data_start;
4946 if (enable_timing) t_send_data_start = TimingHelpers::timer();
4947
4948 // Now send the data between all processors
4949 MPI_Alltoallv(&send_values_data[0],
4950 &send_n[0],
4951 &send_displacement[0],
4952 MPI_DOUBLE,
4953 &receive_values_data[0],
4954 &receive_n[0],
4955 &receive_displacement[0],
4956 MPI_DOUBLE,
4957 comm_pt->mpi_comm());
4958
4959 // Now send the data between all processors
4960 MPI_Alltoallv(&send_column_indices_data[0],
4961 &send_n[0],
4962 &send_displacement[0],
4963 MPI_UNSIGNED,
4964 &receive_column_indices_data[0],
4965 &receive_n[0],
4966 &receive_displacement[0],
4967 MPI_UNSIGNED,
4968 comm_pt->mpi_comm());
4969
4970 if (enable_timing)
4971 {
4972 double t_send_data_finish = TimingHelpers::timer();
4973 double t_send_data_time = t_send_data_finish - t_send_data_start;
4974 oomph_info << "t_send_data_time: " << t_send_data_time << std::endl;
4975 }
4976
4977 // All the rows for this processor are stored in:
4978 // from other processors:
4979 // receive_column_indices_data and receive_values_data
4980 // from this processor:
4981 // column_indices_to_send[my_rank] and values_to_send[my_rank]
4982 //
4983 // They are in some order determined by the distribution.
4984 // We need to re-arrange them. To do this, we do some pre-processing.
4985
4986 // nrow_local for this processor.
4987 unsigned res_nrow_local = res_distribution_pt->nrow_local();
4988
4989 // Per row, store:
4990 // 1) where this row came from, 0 - this proc, 1 - other procs.
4991 // 2) the nnz,
4992 // 3) the offset - where the values/columns in the receive data vectors
4993 // begins. This is different from the offset of where
4994 // the data from a certain processor starts.
4995 Vector<Vector<unsigned>> value_column_locations(res_nrow_local,
4996 Vector<unsigned>(3, 0));
4997
4998 // Store the local nnz so we can reserve space for
4999 // the values and column indices.
5000 unsigned long res_nnz_local = 0;
5001
5002 double t_locations_start;
5003 if (enable_timing) t_locations_start = TimingHelpers::timer();
5004
5005 // Loop through the data currently on this processor.
5006 unsigned location_i = 0;
5007 unsigned my_column_indices_to_send_size =
5008 column_indices_to_send[my_rank].size();
5009 while (location_i < my_column_indices_to_send_size)
5010 {
5011 unsigned current_local_eqn =
5012 column_indices_to_send[my_rank][location_i++];
5013
5014 unsigned current_nnz = column_indices_to_send[my_rank][location_i++];
5015
5016 // No need to fill [*][0] with 0 since it is already initialised to 0.
5017
5018 // Store the nnz.
5019 value_column_locations[current_local_eqn][1] = current_nnz;
5020
5021 // Also increment the res_local_nnz
5022 res_nnz_local += current_nnz;
5023
5024 // Store the offset.
5025 value_column_locations[current_local_eqn][2] = location_i;
5026
5027 // Update the location_i so it starts at the next row.
5028 location_i += current_nnz;
5029 }
5030
5031 // Loop through the data from different processors.
5032
5033 // Check to see if data has been received.
5034 bool data_has_been_received = false;
5035 unsigned send_rank = 0;
5036 while (send_rank < nproc)
5037 {
5038 if (receive_n[send_rank] > 0)
5039 {
5040 data_has_been_received = true;
5041 break;
5042 }
5043 send_rank++;
5044 }
5045
5046 location_i = 0; // start at 0.
5047 if (data_has_been_received)
5048 {
5049 unsigned receive_column_indices_data_size =
5050 receive_column_indices_data.size();
5051 while (location_i < receive_column_indices_data_size)
5052 {
5053 unsigned current_local_eqn =
5054 receive_column_indices_data[location_i++];
5055 unsigned current_nnz = receive_column_indices_data[location_i++];
5056
5057 // These comes from other processors.
5058 value_column_locations[current_local_eqn][0] = 1;
5059
5060 // Store the nnz.
5061 value_column_locations[current_local_eqn][1] = current_nnz;
5062
5063 // Also increment the res_local_nnz
5064 res_nnz_local += current_nnz;
5065
5066 // Store the offset.
5067 value_column_locations[current_local_eqn][2] = location_i;
5068
5069 // Update the location_i so it starts at the next row.
5070 location_i += current_nnz;
5071 }
5072 }
5073
5074 if (enable_timing)
5075 {
5076 double t_locations_finish = TimingHelpers::timer();
5077 double t_locations_time = t_locations_finish - t_locations_start;
5078 oomph_info << "t_locations_time: " << t_locations_time << std::endl;
5079 }
5080
5081 double t_fillvecs_start;
5082 if (enable_timing) t_fillvecs_start = TimingHelpers::timer();
5083
5084      // Now loop through the locations and store the values and
5085      // the column indices in the correct order.
5086 Vector<int> res_column_indices;
5087 Vector<double> res_values;
5088 Vector<int> res_row_start;
5089
5090 res_column_indices.reserve(res_nnz_local);
5091 res_values.reserve(res_nnz_local);
5092 res_row_start.reserve(res_nrow_local + 1);
5093
5094 // Running sum of nnz for the row_start. Must be int because
5095 // res_row_start is templated with int.
5096 int nnz_running_sum = 0;
5097
5098 // Now insert the rows.
5099 for (unsigned local_row_i = 0; local_row_i < res_nrow_local;
5100 local_row_i++)
5101 {
5102 // Fill the res_row_start with the nnz so far.
5103 res_row_start.push_back(nnz_running_sum);
5104
5105 bool data_is_from_other_proc =
5106 bool(value_column_locations[local_row_i][0]);
5107
5108 unsigned row_i_nnz = value_column_locations[local_row_i][1];
5109
5110 unsigned row_i_offset = value_column_locations[local_row_i][2];
5111
5112 if (data_is_from_other_proc)
5113 {
5114 // Insert range [offset, offset+nnz) from
5115 // receive_column_indices_data and receive_values_data into
5116 // res_column_indices and res_values respectively.
5117 res_column_indices.insert(
5118 res_column_indices.end(),
5119 receive_column_indices_data.begin() + row_i_offset,
5120 receive_column_indices_data.begin() + row_i_offset + row_i_nnz);
5121
5122 res_values.insert(res_values.end(),
5123 receive_values_data.begin() + row_i_offset,
5124 receive_values_data.begin() + row_i_offset +
5125 row_i_nnz);
5126 }
5127 else
5128 {
5129 res_column_indices.insert(res_column_indices.end(),
5130 column_indices_to_send[my_rank].begin() +
5131 row_i_offset,
5132 column_indices_to_send[my_rank].begin() +
5133 row_i_offset + row_i_nnz);
5134
5135 res_values.insert(res_values.end(),
5136 values_to_send[my_rank].begin() + row_i_offset,
5137 values_to_send[my_rank].begin() + row_i_offset +
5138 row_i_nnz);
5139 }
5140
5141 // Update the running sum of nnz
5142 nnz_running_sum += row_i_nnz;
5143 }
5144
5145 // Insert the last row_start value
5146 res_row_start.push_back(res_nnz_local);
5147
5148 if (enable_timing)
5149 {
5150 double t_fillvecs_finish = TimingHelpers::timer();
5151 double t_fillvecs_time = t_fillvecs_finish - t_fillvecs_start;
5152 oomph_info << "t_fillvecs_time: " << t_fillvecs_time << std::endl;
5153 }
5154
5155 double t_buildres_start;
5156 if (enable_timing) t_buildres_start = TimingHelpers::timer();
5157
5158 // build the matrix.
5159 result_matrix.build(
5160 res_ncol, res_values, res_column_indices, res_row_start);
5161
5162 if (enable_timing)
5163 {
5164 double t_buildres_finish = TimingHelpers::timer();
5165 double t_buildres_time = t_buildres_finish - t_buildres_start;
5166 oomph_info << "t_buildres_time: " << t_buildres_time << std::endl;
5167 }
5168 // */
5169#endif
5170 }
5171 }
5172
5173 //============================================================================
5174 /// Concatenate CRDoubleMatrix matrices.
5175 ///
5176 /// The Vector row_distribution_pt contains the LinearAlgebraDistribution
5177 /// of each block row.
5178 /// The Vector col_distribution_pt contains the LinearAlgebraDistribution
5179 /// of each block column.
5180 /// The DenseMatrix matrix_pt contains pointers to the CRDoubleMatrices
5181 /// to concatenate.
5182 /// The CRDoubleMatrix result_matrix is the result matrix.
5183 ///
5184 /// The result matrix is a permutation of the sub matrices such that the
5185 /// data stays on the same processor when the result matrix is built, there
5186  /// is no communication between processors. Thus the block structure of the
5187  /// sub matrices is NOT preserved in the result matrix. The rows are
5188 /// block-permuted, defined by the concatenation of the distributions in
5189 /// row_distribution_pt. Similarly, the columns are block-permuted, defined
5190 /// by the concatenation of the distributions in col_distribution_pt. For
5191 /// more details on the block-permutation, see
5192 /// LinearAlgebraDistributionHelpers::concatenate(...).
5193 ///
5194 /// If one wishes to preserve the block structure of the sub matrices in the
5195 /// result matrix, consider using CRDoubleMatrixHelpers::concatenate(...),
5196 /// which uses communication between processors to ensure that the block
5197  /// structure of the sub matrices is preserved.
5198 ///
5199 /// The matrix manipulation functions
5200 /// CRDoubleMatrixHelpers::concatenate(...) and
5201 /// CRDoubleMatrixHelpers::concatenate_without_communication(...)
5202 /// are analogous to the Vector manipulation functions
5203 /// DoubleVectorHelpers::concatenate(...) and
5204 /// DoubleVectorHelpers::concatenate_without_communication(...).
5205 /// Please look at the DoubleVector functions for an illustration of the
5206 /// differences between concatenate(...) and
5207 /// concatenate_without_communication(...).
5208 ///
5209 /// Distribution of the result matrix:
5210 /// If the result matrix does not have a distribution built, then it will be
5211 /// given a distribution built from the concatenation of the distributions
5212 /// from row_distribution_pt, see
5213 /// LinearAlgebraDistributionHelpers::concatenate(...) for more detail.
5214 /// Otherwise we use the existing distribution.
5215 /// If there is an existing distribution then it must be the same as the
5216 /// distribution from the concatenation of row distributions as described
5217 /// above.
5218 /// Why don't we always compute the distribution "on the fly"?
5219 /// Because a non-uniform distribution requires communication.
5220 /// All block preconditioner distributions are concatenations of the
5221 /// distributions of the individual blocks.
5222 //============================================================================
5224 const Vector<LinearAlgebraDistribution*>& row_distribution_pt,
5225 const Vector<LinearAlgebraDistribution*>& col_distribution_pt,
5226 const DenseMatrix<CRDoubleMatrix*>& matrix_pt,
5227 CRDoubleMatrix& result_matrix)
5228 {
5229 // The number of block rows and block columns.
5230 unsigned matrix_nrow = matrix_pt.nrow();
5231 unsigned matrix_ncol = matrix_pt.ncol();
5232
5233 // PARANOID checks involving in matrices and block_distribution only.
5234 // PARANOID checks involving the result matrix will come later since
5235 // we have to create the result matrix distribution from the in
5236 // distribution if it does not already exist.
5237#ifdef PARANOID
5238
5239 // Are there matrices to concatenate?
5240 if (matrix_nrow == 0 || matrix_ncol == 0)
5241 {
5242 std::ostringstream error_message;
5243 error_message << "There are no matrices to concatenate.\n";
5244 throw OomphLibError(error_message.str(),
5245 OOMPH_CURRENT_FUNCTION,
5246 OOMPH_EXCEPTION_LOCATION);
5247 }
5248
5249 // Does this matrix need concatenating?
5250 if ((matrix_nrow == 1) && (matrix_ncol == 1))
5251 {
5252 std::ostringstream warning_message;
5253 warning_message << "There is only one matrix to concatenate...\n"
5254 << "This does not require concatenating...\n";
5255 OomphLibWarning(warning_message.str(),
5256 OOMPH_CURRENT_FUNCTION,
5257 OOMPH_EXCEPTION_LOCATION);
5258 }
5259
5260
5261 // The distribution for each block row is stored in row_distribution_pt.
5262 // So the number of distributions in row_distribution_pt must be the
5263 // same as matrix_nrow.
5264 if (matrix_nrow != row_distribution_pt.size())
5265 {
5266 std::ostringstream error_message;
5267 error_message << "The number of row distributions must be the same as\n"
5268 << "the number of block rows.";
5269 throw OomphLibError(error_message.str(),
5270 OOMPH_CURRENT_FUNCTION,
5271 OOMPH_EXCEPTION_LOCATION);
5272 }
5273
5274 // The number of distributions for the columns must match the number of
5275 // block columns.
5276 if (matrix_ncol != col_distribution_pt.size())
5277 {
5278 std::ostringstream error_message;
5279 error_message
5280 << "The number of column distributions must be the same as\n"
5281 << "the number of block columns.";
5282 throw OomphLibError(error_message.str(),
5283 OOMPH_CURRENT_FUNCTION,
5284 OOMPH_EXCEPTION_LOCATION);
5285 }
5286
5287      // Check that all pointers in row_distribution_pt are not null.
5288 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5289 {
5290 if (row_distribution_pt[block_row_i] == 0)
5291 {
5292 std::ostringstream error_message;
5293 error_message << "The row distribution pointer in position "
5294 << block_row_i << " is null.\n";
5295 throw OomphLibError(error_message.str(),
5296 OOMPH_CURRENT_FUNCTION,
5297 OOMPH_EXCEPTION_LOCATION);
5298 }
5299 }
5300
5301      // Check that all pointers in col_distribution_pt are not null.
5302 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5303 {
5304 if (col_distribution_pt[block_col_i] == 0)
5305 {
5306 std::ostringstream error_message;
5307 error_message << "The column distribution pointer in position "
5308 << block_col_i << " is null.\n";
5309 throw OomphLibError(error_message.str(),
5310 OOMPH_CURRENT_FUNCTION,
5311 OOMPH_EXCEPTION_LOCATION);
5312 }
5313 }
5314
5315 // Check that all distributions are built.
5316 // First the row distributions
5317 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5318 {
5319 if (!row_distribution_pt[block_row_i]->built())
5320 {
5321 std::ostringstream error_message;
5322 error_message << "The distribution pointer in position "
5323 << block_row_i << " is not built.\n";
5324 throw OomphLibError(error_message.str(),
5325 OOMPH_CURRENT_FUNCTION,
5326 OOMPH_EXCEPTION_LOCATION);
5327 }
5328 }
5329 // Now the column distributions
5330 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5331 {
5332 if (!col_distribution_pt[block_col_i]->built())
5333 {
5334 std::ostringstream error_message;
5335 error_message << "The distribution pointer in position "
5336 << block_col_i << " is not built.\n";
5337 throw OomphLibError(error_message.str(),
5338 OOMPH_CURRENT_FUNCTION,
5339 OOMPH_EXCEPTION_LOCATION);
5340 }
5341 }
5342
5343 // Check that all communicators in row_distribution_pt are the same.
5344 const OomphCommunicator first_row_comm =
5345 *(row_distribution_pt[0]->communicator_pt());
5346
5347 for (unsigned block_row_i = 1; block_row_i < matrix_nrow; block_row_i++)
5348 {
5349 const OomphCommunicator current_comm =
5350 *(row_distribution_pt[block_row_i]->communicator_pt());
5351
5352 if (first_row_comm != current_comm)
5353 {
5354 std::ostringstream error_message;
5355 error_message
5356 << "The communicator from the row distribution in position "
5357 << block_row_i << " is not the same as the first "
5358 << "communicator from row_distribution_pt";
5359 throw OomphLibError(error_message.str(),
5360 OOMPH_CURRENT_FUNCTION,
5361 OOMPH_EXCEPTION_LOCATION);
5362 }
5363 }
5364
5365 // Check that all communicators in col_distribution_pt are the same as the
5366 // first row communicator from above.
5367 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5368 {
5369 const OomphCommunicator current_comm =
5370 *(col_distribution_pt[block_col_i]->communicator_pt());
5371
5372 if (first_row_comm != current_comm)
5373 {
5374 std::ostringstream error_message;
5375 error_message
5376 << "The communicator from the col distribution in position "
5377 << block_col_i << " is not the same as the first "
5378 << "communicator from row_distribution_pt";
5379 throw OomphLibError(error_message.str(),
5380 OOMPH_CURRENT_FUNCTION,
5381 OOMPH_EXCEPTION_LOCATION);
5382 }
5383 }
5384
5385 // Are all sub matrices built? If the matrix_pt is not null, make sure
5386 // that it is built.
5387 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5388 {
5389 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5390 {
5391 if (matrix_pt(block_row_i, block_col_i) != 0 &&
5392 !(matrix_pt(block_row_i, block_col_i)->built()))
5393 {
5394 std::ostringstream error_message;
5395 error_message << "The sub matrix_pt(" << block_row_i << ","
5396 << block_col_i << ")\n"
5397 << "is not built.\n";
5398 throw OomphLibError(error_message.str(),
5399 OOMPH_CURRENT_FUNCTION,
5400 OOMPH_EXCEPTION_LOCATION);
5401 }
5402 }
5403 }
5404
5405 // For the matrices which are built, do they have the same communicator as
5406 // the first communicator from row_distribution_pt?
5407 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5408 {
5409 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5410 {
5411 if (matrix_pt(block_row_i, block_col_i) != 0)
5412 {
5413 const OomphCommunicator current_comm =
5414 *(matrix_pt(block_row_i, block_col_i)
5415 ->distribution_pt()
5416 ->communicator_pt());
5417 if (first_row_comm != current_comm)
5418 {
5419 std::ostringstream error_message;
5420 error_message
5421 << "The sub matrix_pt(" << block_row_i << "," << block_col_i
5422 << ")\n"
5423 << "does not have the same communicator pointer as those in\n"
5424 << "(row|col)_distribution_pt.\n";
5425 throw OomphLibError(error_message.str(),
5426 OOMPH_CURRENT_FUNCTION,
5427 OOMPH_EXCEPTION_LOCATION);
5428 }
5429 }
5430 }
5431 }
5432
5433 // Do all dimensions of sub matrices "make sense"?
5434 // Compare the number of rows of each block matrix in a block row.
5435 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5436 {
5437 // Use the first column to compare against the rest.
5438 unsigned long current_block_nrow =
5439 row_distribution_pt[block_row_i]->nrow();
5440
5441 // Compare against columns 0 to matrix_ncol - 1
5442 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5443 {
5444 // Perform the check if the matrix_pt is not null.
5445 if (matrix_pt(block_row_i, block_col_i) != 0)
5446 {
5447 // Get the nrow for this sub block.
5448 unsigned long subblock_nrow =
5449 matrix_pt(block_row_i, block_col_i)->nrow();
5450
5451 if (current_block_nrow != subblock_nrow)
5452 {
5453 std::ostringstream error_message;
5454 error_message << "The sub matrix (" << block_row_i << ","
5455 << block_col_i << ")\n"
5456 << "requires nrow = " << current_block_nrow
5457 << ", but has nrow = " << subblock_nrow << ".\n"
5458 << "Either the row_distribution_pt is incorrect or "
5459 << "the sub matrices are incorrect.\n";
5460 throw OomphLibError(error_message.str(),
5461 OOMPH_CURRENT_FUNCTION,
5462 OOMPH_EXCEPTION_LOCATION);
5463 }
5464 }
5465 }
5466 }
5467
5468 // Compare the number of columns of each block matrix in a block column.
5469 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5470 {
5471 // Get the current block ncol from the linear algebra distribution.
5472 // Note that we assume that the dimensions are symmetrical.
5473 unsigned current_block_ncol = col_distribution_pt[block_col_i]->nrow();
5474
5475 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5476 {
5477 if (matrix_pt(block_row_i, block_col_i) != 0)
5478 {
5479 // Get the ncol for this sub block.
5480 unsigned subblock_ncol =
5481 matrix_pt(block_row_i, block_col_i)->ncol();
5482
5483 if (current_block_ncol != subblock_ncol)
5484 {
5485 std::ostringstream error_message;
5486 error_message << "The sub matrix (" << block_row_i << ","
5487 << block_col_i << ")\n"
5488 << "requires ncol = " << current_block_ncol
5489 << ", but has ncol = " << subblock_ncol << ".\n"
5490 << "Either the col_distribution_pt is incorrect or "
5491 << "the sub matrices are incorrect.\n";
5492 throw OomphLibError(error_message.str(),
5493 OOMPH_CURRENT_FUNCTION,
5494 OOMPH_EXCEPTION_LOCATION);
5495 }
5496 }
5497 }
5498 }
5499
5500 // Ensure that the distributions for all sub matrices in the same block
5501 // row are the same. This is because we permute the row across several
5502 // matrices.
5503
5504 // Loop through each block row.
5505 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5506 {
5507 // Get the distribution from the first block in this row.
5508 LinearAlgebraDistribution* block_row_distribution_pt =
5509 row_distribution_pt[block_row_i];
5510
5511 // Loop through the block columns
5512 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5513 {
5514 if (matrix_pt(block_row_i, block_col_i) != 0)
5515 {
5516 // Get the distribution for this block.
5517 LinearAlgebraDistribution* current_block_distribution_pt =
5518 matrix_pt(block_row_i, block_col_i)->distribution_pt();
5519
5520 // Ensure that the in matrices is a square block matrix.
5521 if ((*block_row_distribution_pt) !=
5522 (*current_block_distribution_pt))
5523 {
5524 std::ostringstream error_message;
5525 error_message
5526 << "Sub block(" << block_row_i << "," << block_col_i << ")"
5527 << "does not have the same distributoin as the first"
5528 << "block in this block row.\n"
5529 << "All distributions on a block row must be the same"
5530 << "for this function to concatenate matrices.\n";
5531 throw OomphLibError(error_message.str(),
5532 OOMPH_CURRENT_FUNCTION,
5533 OOMPH_EXCEPTION_LOCATION);
5534 }
5535 }
5536 }
5537 }
5538#endif
5539
5540 // The communicator pointer from the first row_distribution_pt
5541 const OomphCommunicator* const comm_pt =
5542 row_distribution_pt[0]->communicator_pt();
5543
5544      // Renamed so it makes more sense.
5545 unsigned nblock_row = matrix_nrow;
5546
5547 // If the result matrix does not have a distribution, then we concatenate
5548 // the sub matrix distributions.
5549 if (!result_matrix.distribution_pt()->built())
5550 {
5551 // The result distribution
5552 LinearAlgebraDistribution tmp_distribution;
5554 tmp_distribution);
5555
5556 result_matrix.build(&tmp_distribution);
5557 }
5558 else
5559 // A distribution is supplied for the result matrix.
5560 {
5561#ifdef PARANOID
5562 // Check that the result distribution is a concatenation of the
5563 // distributions of the sub matrices.
5564
5565 LinearAlgebraDistribution wanted_distribution;
5566
5568 wanted_distribution);
5569
5570 if (*(result_matrix.distribution_pt()) != wanted_distribution)
5571 {
5572 std::ostringstream error_message;
5573 error_message
5574 << "The result distribution is not correct.\n"
5575 << "Please call the function without a result\n"
5576 << "distribution (clear the result matrix) or check the\n"
5577 << "distribution of the result matrix.\n"
5578 << "The result distribution must be the same as the one \n"
5579 << "created by\n"
5580 << "LinearAlgebraDistributionHelpers::concatenate(...)";
5581 throw OomphLibError(error_message.str(),
5582 OOMPH_CURRENT_FUNCTION,
5583 OOMPH_EXCEPTION_LOCATION);
5584 }
5585#endif
5586 }
5587
5588 // The rest of the paranoid checks.
5589#ifdef PARANOID
5590
5591 // Make sure that the communicator from the result matrix is the same as
5592 // all the others. This test is redundant if this function created the
5593 // result matrix distribution, since then it is guaranteed that the
5594 // communicators are the same.
5595 {
5596 // Communicator from the result matrix.
5597 const OomphCommunicator res_comm =
5598 *(result_matrix.distribution_pt()->communicator_pt());
5599
5600 // Is the result communicator pointer the same as the others?
5601 // Since we have already tested the others, we only need to compare
5602 // against one of them. Say the first communicator from
5603 // row_distribution_pt.
5604 const OomphCommunicator first_comm =
5605 *(row_distribution_pt[0]->communicator_pt());
5606
5607 if (res_comm != first_comm)
5608 {
5609 std::ostringstream error_message;
5610 error_message << "The OomphCommunicator of the result matrix is not "
5611 "the same as the "
5612 << "others!";
5613 throw OomphLibError(error_message.str(),
5614 OOMPH_CURRENT_FUNCTION,
5615 OOMPH_EXCEPTION_LOCATION);
5616 }
5617 }
5618
5619 // Are all the distributed boolean the same? This only applies if we have
5620 // more than one processor. If there is only one processor, then it does
5621 // not matter if it is distributed or not - they are conceptually the
5622 // same.
5623 if (comm_pt->nproc() != 1)
5624 {
5625 // Compare distributed for sub matrices (against the result matrix).
5626 const bool res_distributed = result_matrix.distributed();
5627
5628 // Loop over all sub blocks.
5629 for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5630 {
5631 for (unsigned block_col_i = 0; block_col_i < matrix_ncol;
5632 block_col_i++)
5633 {
5634 if (matrix_pt(block_row_i, block_col_i) != 0)
5635 {
5636 const bool another_distributed =
5637 matrix_pt(block_row_i, block_col_i)->distributed();
5638
5639 if (res_distributed != another_distributed)
5640 {
5641 std::ostringstream error_message;
5642 error_message << "The distributed boolean of the sub matrix ("
5643 << block_row_i << "," << block_col_i << ")\n"
5644 << "is not the same as the result matrix. \n";
5645 throw OomphLibError(error_message.str(),
5646 OOMPH_CURRENT_FUNCTION,
5647 OOMPH_EXCEPTION_LOCATION);
5648 }
5649 }
5650 }
5651 }
5652
5653 // Do this test for row_distribution_pt
5654 const bool first_row_distribution_distributed =
5655 row_distribution_pt[0]->distributed();
5656
5657 for (unsigned block_row_i = 1; block_row_i < matrix_nrow; block_row_i++)
5658 {
5659 const bool another_distributed =
5660 row_distribution_pt[block_row_i]->distributed();
5661
5662 if (first_row_distribution_distributed != another_distributed)
5663 {
5664 std::ostringstream error_message;
5665 error_message
5666 << "The distributed boolean of row_distribution_pt["
5667 << block_row_i << "]\n"
5668 << "is not the same as the one from row_distribution_pt[0]. \n";
5669 throw OomphLibError(error_message.str(),
5670 OOMPH_CURRENT_FUNCTION,
5671 OOMPH_EXCEPTION_LOCATION);
5672 }
5673 }
5674
5675 // Repeat for col_distribution_pt
5676 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5677 {
5678 const bool another_distributed =
5679 col_distribution_pt[block_col_i]->distributed();
5680
5681 if (first_row_distribution_distributed != another_distributed)
5682 {
5683 std::ostringstream error_message;
5684 error_message
5685 << "The distributed boolean of col_distribution_pt["
5686 << block_col_i << "]\n"
5687 << "is not the same as the one from row_distribution_pt[0]. \n";
5688 throw OomphLibError(error_message.str(),
5689 OOMPH_CURRENT_FUNCTION,
5690 OOMPH_EXCEPTION_LOCATION);
5691 }
5692 }
5693 }
5694#endif
5695
5696 /// /////////////// END OF PARANOID TESTS
5697 /// ////////////////////////////////////
5698
5699 // The number of processors.
5700 unsigned nproc = comm_pt->nproc();
5701
5702 // Cache the result distribution pointer for convenience.
5703 LinearAlgebraDistribution* res_distribution_pt =
5704 result_matrix.distribution_pt();
5705
5706 // nrow_local for the result matrix
5707 unsigned res_nrow_local = res_distribution_pt->nrow_local();
5708
5709 // renamed for readability.
5710 unsigned nblock_col = matrix_ncol;
5711
5712 // construct the block offset
5713 // DenseMatrix<unsigned> col_offset(nproc,nblock_col,0);
5714 std::vector<std::vector<unsigned>> col_offset(
5715 nproc, std::vector<unsigned>(nblock_col));
5716 unsigned off = 0;
5717 for (unsigned proc_i = 0; proc_i < nproc; proc_i++)
5718 {
5719 for (unsigned block_i = 0; block_i < nblock_col; block_i++)
5720 {
5721 col_offset[proc_i][block_i] = off;
5722 off += col_distribution_pt[block_i]->nrow_local(proc_i);
5723 }
5724 }
5725
5726 // Do some pre-processing for the processor number a global row number is
5727 // on. This is required when permuting the column entries.
5728 // We need to do this for each distribution, so we have a vector of
5729 // vectors. First index corresponds to the distribution, the second is
5730 // the processor number.
5731 std::vector<std::vector<unsigned>> p_for_rows(nblock_col,
5732 std::vector<unsigned>());
5733 // initialise 2D vector
5734 for (unsigned blocki = 0; blocki < nblock_col; blocki++)
5735 {
5736 int blockinrow = col_distribution_pt[blocki]->nrow();
5737 p_for_rows[blocki].resize(blockinrow);
5738 // FOR each global index in the block, work out the corresponding proc.
5739 for (int rowi = 0; rowi < blockinrow; rowi++)
5740 {
5741 unsigned p = 0;
5742 int b_first_row = col_distribution_pt[blocki]->first_row(p);
5743 int b_nrow_local = col_distribution_pt[blocki]->nrow_local(p);
5744
5745 while (rowi < b_first_row || rowi >= b_nrow_local + b_first_row)
5746 {
5747 p++;
5748 b_first_row = col_distribution_pt[blocki]->first_row(p);
5749 b_nrow_local = col_distribution_pt[blocki]->nrow_local(p);
5750 }
5751 p_for_rows[blocki][rowi] = p;
5752 }
5753 }
5754
5755 // determine nnz of all blocks on this processor only.
5756 // This is used to create storage space.
5757 unsigned long res_nnz = 0;
5758 for (unsigned row_i = 0; row_i < nblock_row; row_i++)
5759 {
5760 for (unsigned col_i = 0; col_i < nblock_col; col_i++)
5761 {
5762 if (matrix_pt(row_i, col_i) != 0)
5763 {
5764 res_nnz += matrix_pt(row_i, col_i)->nnz();
5765 }
5766 }
5767 }
5768
5769 // My rank
5770 // unsigned my_rank = comm_pt->my_rank();
5771 // my_rank = my_rank;
5772
5773 // Turn the above into a string.
5774 // std::ostringstream myrankstream;
5775 // myrankstream << "THISDOESNOTHINGnp" << my_rank << std::endl;
5776 // std::string myrankstring = myrankstream.str();
5777
5778
5779 // CALLGRIND_ZERO_STATS;
5780 // CALLGRIND_START_INSTRUMENTATION;
5781
5782 // storage for the result matrix.
5783 int* res_row_start = new int[res_nrow_local + 1];
5784 int* res_column_index = new int[res_nnz];
5785 double* res_value = new double[res_nnz];
5786
5787 // initialise the zero-th entry
5788 res_row_start[0] = 0;
5789
5790 // loop over the block rows
5791 unsigned long res_i = 0; // index for the result matrix.
5792 unsigned long res_row_i = 0; // index for the row
5793 for (unsigned i = 0; i < nblock_row; i++)
5794 {
5795 // loop over the rows of the current block local rows.
5796 unsigned block_nrow = row_distribution_pt[i]->nrow_local();
5797 for (unsigned k = 0; k < block_nrow; k++)
5798 {
5799 // initialise res_row_start
5800 res_row_start[res_row_i + 1] = res_row_start[res_row_i];
5801
5802 // Loop over the block columns
5803 for (unsigned j = 0; j < nblock_col; j++)
5804 {
5805 // if block(i,j) pointer is not null then
5806 if (matrix_pt(i, j) != 0)
5807 {
5808 // get pointers for the elements in the current block
5809 int* b_row_start = matrix_pt(i, j)->row_start();
5810 int* b_column_index = matrix_pt(i, j)->column_index();
5811 double* b_value = matrix_pt(i, j)->value();
5812
5813 // memcpy( &dst[dstIdx], &src[srcIdx], numElementsToCopy * sizeof(
5814 // Element ) );
5815 // no ele to copy
5816 int numEleToCopy = b_row_start[k + 1] - b_row_start[k];
5817 memcpy(res_value + res_i,
5818 b_value + b_row_start[k],
5819 numEleToCopy * sizeof(double));
5820 // Loop through the current local row.
5821 for (int l = b_row_start[k]; l < b_row_start[k + 1]; l++)
5822 {
5823 // if b_column_index[l] was a row index, what processor
5824 // would it be on
5825 // unsigned p = col_distribution_pt[j]
5826 // ->rank_of_global_row_map(b_column_index[l]);
5827 unsigned p = p_for_rows[j][b_column_index[l]];
5828
5829 int b_first_row = col_distribution_pt[j]->first_row(p);
5830 // int b_nrow_local =
5831 // col_distribution_pt[j]->nrow_local(p);
5832
5833 // while (b_column_index[l] < b_first_row ||
5834 // b_column_index[l] >=
5835 // b_nrow_local+b_first_row)
5836 // {
5837 // p++;
5838 // b_first_row =
5839 // col_distribution_pt[j]->first_row(p);
5840 // b_nrow_local =
5841 // col_distribution_pt[j]->nrow_local(p);
5842 // }
5843
5844 // determine the local equation number in the block j/processor
5845 // p "column block"
5846 int eqn = b_column_index[l] - b_first_row;
5847
5848 // add to the result matrix
5849 // res_value[res_i] = b_value[l];
5850 res_column_index[res_i] = col_offset[p][j] + eqn;
5851 res_row_start[res_row_i + 1]++;
5852 res_i++;
5853 }
5854 }
5855 }
5856
5857 // increment the row pt
5858 res_row_i++;
5859 }
5860 }
5861 // CALLGRIND_STOP_INSTRUMENTATION;
5862 // CALLGRIND_DUMP_STATS_AT(myrankstring.c_str());
5863
5864
5865 // Get the number of columns of the result matrix.
5866 unsigned res_ncol = 0;
5867 for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5868 {
5869 res_ncol += col_distribution_pt[block_col_i]->nrow();
5870 }
5871
5872 // Build the result matrix.
5873 result_matrix.build_without_copy(
5874 res_ncol, res_nnz, res_value, res_column_index, res_row_start);
5875 }
5876
5877
5878 //============================================================================
5879 /// Concatenate CRDoubleMatrix matrices.
5880 /// This calls the other concatenate_without_communication(...) function,
5881 /// passing block_distribution_pt as both the row_distribution_pt and
5882 /// col_distribution_pt. This should only be called for block square
5883 /// matrices.
5884 //============================================================================
5886 const Vector<LinearAlgebraDistribution*>& block_distribution_pt,
5887 const DenseMatrix<CRDoubleMatrix*>& matrix_pt,
5888 CRDoubleMatrix& result_matrix)
5889 {
5890#ifdef PARANOID
5891 // The number of block rows and block columns.
5892 unsigned matrix_nrow = matrix_pt.nrow();
5893 unsigned matrix_ncol = matrix_pt.ncol();
5894
5895 // Are there matrices to concatenate?
5896 if (matrix_nrow == 0)
5897 {
5898 std::ostringstream error_message;
5899 error_message << "There are no matrices to concatenate.\n";
5900 throw OomphLibError(error_message.str(),
5901 OOMPH_CURRENT_FUNCTION,
5902 OOMPH_EXCEPTION_LOCATION);
5903 }
5904
5905 // Ensure that the sub matrices is a square block matrix.
5906 if (matrix_nrow != matrix_ncol)
5907 {
5908 std::ostringstream error_message;
5909 error_message
5910 << "The number of block rows and block columns\n"
5911 << "must be the same. Otherwise, call the other\n"
5912 << "concatenate_without_communication function, passing in\n"
5913 << "a Vector of distributions describing how to permute the\n"
5914 << "columns.";
5915 throw OomphLibError(error_message.str(),
5916 OOMPH_CURRENT_FUNCTION,
5917 OOMPH_EXCEPTION_LOCATION);
5918 }
5919#endif
5920
5922 block_distribution_pt, block_distribution_pt, matrix_pt, result_matrix);
5923 }
5924
5925 } // namespace CRDoubleMatrixHelpers
5926
5927} // namespace oomph
cstr elem_len * i
Definition: cfortran.h:603
//////////////////////////////////////////////////////////////// ////////////////////////////////////...
Definition: matrices.h:2791
void multiply_transpose(const DoubleVector &x, DoubleVector &soln) const
Multiply the transposed matrix by the vector x: soln=A^T x.
Definition: matrices.cc:715
virtual void lubksub(DoubleVector &rhs)
LU back solve for given RHS.
Definition: matrices.cc:614
void matrix_reduction(const double &alpha, CCDoubleMatrix &reduced_matrix)
For every row, find the maximum absolute value of the entries in this row. Set all values that are le...
Definition: matrices.cc:1149
virtual ~CCDoubleMatrix()
Destructor: Kill the LU factors if they have been setup.
Definition: matrices.cc:597
unsigned long ncol() const
Return the number of columns of the matrix.
Definition: matrices.h:2823
unsigned long nrow() const
Return the number of rows of the matrix.
Definition: matrices.h:2817
CCDoubleMatrix()
Default constructor.
Definition: matrices.cc:572
virtual void ludecompose()
LU decomposition using SuperLU.
Definition: matrices.cc:606
void multiply(const DoubleVector &x, DoubleVector &soln) const
Multiply the matrix by the vector x: soln=Ax.
Definition: matrices.cc:622
unsigned Matrix_matrix_multiply_method
Flag to determine which matrix-matrix multiplication method is used.
Definition: matrices.h:2893
/////////////////////////////////////////////////////////////// /////////////////////////////////////...
Definition: matrices.h:2585
void build_without_copy(T *value, int *row_index, int *column_start, const unsigned long &nnz, const unsigned long &n, const unsigned long &m)
Function to build matrix from pointers to arrays which hold the column starts, row indices and non-ze...
Definition: matrices.h:3199
int * Column_start
Start index for column.
Definition: matrices.h:2779
int * Row_index
Row index.
Definition: matrices.h:2776
int * column_start()
Access to C-style column_start array.
Definition: matrices.h:2692
void build(const Vector< double > &value, const Vector< int > &row_index, const Vector< int > &column_start, const unsigned long &n, const unsigned long &m)
Build matrix from compressed representation. Number of nonzero entries is read off from value,...
Definition: matrices.h:3247
int * row_index()
Access to C-style row index array.
Definition: matrices.h:2704
A class for compressed row matrices. This is a distributable object.
Definition: matrices.h:888
void sort_entries()
Sorts the entries associated with each row of the matrix in the column index vector and the value vec...
Definition: matrices.cc:1449
virtual ~CRDoubleMatrix()
Destructor.
Definition: matrices.cc:1343
int * row_start()
Access to C-style row_start array.
Definition: matrices.h:1060
struct oomph::CRDoubleMatrix::CRDoubleMatrixComparisonHelper Comparison_struct
void matrix_reduction(const double &alpha, CRDoubleMatrix &reduced_matrix)
For every row, find the maximum absolute value of the entries in this row. Set all values that are le...
Definition: matrices.cc:2365
void multiply_transpose(const DoubleVector &x, DoubleVector &soln) const
Multiply the transposed matrix by the vector x: soln=A^T x.
Definition: matrices.cc:1882
virtual void ludecompose()
LU decomposition using SuperLU if matrix is not distributed or distributed onto a single processor.
Definition: matrices.cc:1728
unsigned long ncol() const
Return the number of columns of the matrix.
Definition: matrices.h:1008
void multiply(const DoubleVector &x, DoubleVector &soln) const
Multiply the matrix by the vector x: soln=Ax.
Definition: matrices.cc:1782
void add(const CRDoubleMatrix &matrix_in, CRDoubleMatrix &result_matrix) const
element-wise addition of this matrix with matrix_in.
Definition: matrices.cc:3515
bool Built
Flag to indicate whether the matrix has been built - i.e. the distribution has been setup AND the mat...
Definition: matrices.h:1253
Vector< int > Index_of_diagonal_entries
Vector whose i'th entry contains the index of the last entry below or on the diagonal of the i'th row...
Definition: matrices.h:1238
double inf_norm() const
returns the inf-norm of this matrix
Definition: matrices.cc:3412
void get_matrix_transpose(CRDoubleMatrix *result) const
Returns the transpose of this matrix.
Definition: matrices.cc:3271
int * column_index()
Access to C-style column index array.
Definition: matrices.h:1072
unsigned long nnz() const
Return the number of nonzero entries (the local nnz)
Definition: matrices.h:1096