block_preconditioner.cc
Go to the documentation of this file.
1// LIC// ====================================================================
2// LIC// This file forms part of oomph-lib, the object-oriented,
3// LIC// multi-physics finite-element library, available
4// LIC// at http://www.oomph-lib.org.
5// LIC//
6// LIC// Copyright (C) 2006-2022 Matthias Heil and Andrew Hazel
7// LIC//
8// LIC// This library is free software; you can redistribute it and/or
9// LIC// modify it under the terms of the GNU Lesser General Public
10// LIC// License as published by the Free Software Foundation; either
11// LIC// version 2.1 of the License, or (at your option) any later version.
12// LIC//
13// LIC// This library is distributed in the hope that it will be useful,
14// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// LIC// Lesser General Public License for more details.
17// LIC//
18// LIC// You should have received a copy of the GNU Lesser General Public
19// LIC// License along with this library; if not, write to the Free Software
20// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21// LIC// 02110-1301 USA.
22// LIC//
23// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
24// LIC//
25// LIC//====================================================================
27
28namespace oomph
29{
30 /// Static boolean to allow block_matrix_test(...) to be run.
31 /// Defaults to false.
32 template<typename MATRIX>
34
35
36 //============================================================================
37 /// Determine the size of the matrix blocks and setup the
38 /// lookup schemes relating the global degrees of freedom with
39 /// their "blocks" and their indices (row/column numbers) in those
40 /// blocks.
41 /// The distributions of the preconditioner and the blocks are
42 /// automatically specified (and assumed to be uniform) at this
43 /// stage.
44 /// This method should be used if any block contains more than one
45 /// type of DOF. The argument vector dof_to_block_map should be of length
46 /// ndof. Each element should contain an integer indicating the block number
47 /// corresponding to that type of DOF.
48 //============================================================================
49 template<typename MATRIX>
51 const Vector<unsigned>& dof_to_block_map_in)
52 {
53#ifdef PARANOID
54 // Subsidiary preconditioners don't really need the meshes
55 if (this->is_master_block_preconditioner())
56 {
57 std::ostringstream err_msg;
58 unsigned n = nmesh();
59 if (n == 0)
60 {
61 err_msg << "No meshes have been set for this block preconditioner!\n"
62 << "Set one with set_nmesh(...), set_mesh(...)" << std::endl;
63 throw OomphLibError(
64 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
65 for (unsigned m = 0; m < n; m++)
66 {
67 if (Mesh_pt[m] == 0)
68 {
69 err_msg << "The mesh pointer to mesh " << m << " is null!\n"
70 << "Set a non-null one with set_mesh(...)" << std::endl;
71 throw OomphLibError(
72 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
73 }
74 }
75 }
76 }
77#endif
78
79 // Create a copy of the vector input so that we can modify it below
80 Vector<unsigned> dof_to_block_map = dof_to_block_map_in;
81
82 if (is_subsidiary_block_preconditioner())
83 {
84#ifdef PARANOID
85 // Get the size of the Doftype_in_master_preconditioner_coarse.
86 unsigned para_doftype_in_master_preconditioner_coarse_size =
87 Doftype_in_master_preconditioner_coarse.size();
88
89 // Check that the Doftype_in_master_preconditioner_coarse vector is not
90 // empty. This must be set (via the function
91 // turn_into_subsidiary_block_preconditioner) if this is a
92 // subsidiary block preconditioner.
93 if (para_doftype_in_master_preconditioner_coarse_size == 0)
94 {
95 std::ostringstream err_msg;
96 err_msg << "The mapping from the dof types of the master "
97 << "block preconditioner \n"
98 << "to the subsidiary block preconditioner is empty.\n"
99 << "Doftype_in_master_preconditioner_coarse.size() == 0 \n"
100 << "has turn_into_subsidiary_block_preconditioner(...)\n"
101 << "been called with the correct parameters?\n"
102 << std::endl;
103 throw OomphLibError(
104 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
105 }
106
107
108 // PARANOID checks for Doftype_coarsen_map_coarse
109 // This is also set in the function
110 // turn_into_subsidiary_block_preconditioner(...).
111 //
112 // The Doftype_coarsen_map_coarse vector must satisfy two conditions
113 // for it to be valid.
114 //
115 // 1) The dof type numbers in the dof_coarsen_map vector must be
116 // unique. For example, it does not make sense to have the vector
117 // [[0,1][1,2]] because the first inner vector says
118 // "treat dof types 0 and 1 as dof type 0" and the second inner vector
119 // says "treat dof type 1 and 2 as dof type 1", but dof type 1 is
120 // already being treated as dof type 0.
121 //
122 // 2) Every SUBSIDIARY dof type must be mapped to a dof type in the
123 // Doftype_coarsen_map_coarse vector.
124 // For example, if there are 5 dof types (passed down from the master
125 // block preconditioner), and this block subsidiary block
126 // preconditioner only deals with 3 dof types, then all 5 dof types
127 // must be mapped to a dof type in the subsidiary preconditioner. For
128 // example if the dof_map is [1,2,3,4,5], then the subsidiary block
129 // preconditioner knows that 5 dof types have been passed down. But if
130 // it only works with three dof types, we MUST have three inner vectors
131 // in the doftype_coarsen_map vector (which corresponds to dof types 0,
132 // 1 and 2), the union of the dof types in the three inner vectors must
133 // contain dof types 0, 1, 2, 3 and 4 exactly once. It cannot contain,
134 // say, 0, 1, 5, 7, 9, even though it passes the uniqueness check. We
135 // ensure this by two conditions:
136 //
137 // 2.1) The Doftype_coarsen_map_coarse vector must contain the same
138 // number of dof types as the dof_map vector.
139 // In other words, recall that Doftype_coarsen_map_coarse is a
140 // 2D vector, this must contain the same number of vectors as
141 // there are elements in the dof_to_block_map_in vector.
142 //
143 // 2.2) The maximum element in the doftype_coarsen_map_coarse vector
144 // is the length of the dof_map vector minus 1.
145
 146 // A set is ideal for checking the above conditions; we shall insert
 147 // all the elements in the doftype_coarsen_map_coarse vector into this
 148 // set.
149 std::set<unsigned> doftype_map_set;
150
151 // Condition (1): Check for uniqueness by inserting all the values of
152 // Doftype_coarsen_map_coarse into a set.
153 unsigned para_doftype_coarsen_map_coarse_size =
154 Doftype_coarsen_map_coarse.size();
155
156 // Loop through the outer vector of Doftype_coarsen_map_coarse
157 // then loop through the inner vectors and attempt to insert each
158 // element of Doftype_coarsen_map_coarse into doftype_map_set.
159 //
160 // The inner for loop will throw an error if we cannot insert the
161 // element, this means that it is already inserted and thus not unique.
162 for (unsigned i = 0; i < para_doftype_coarsen_map_coarse_size; i++)
163 {
164 // Loop through the inner vector
165 unsigned para_doftype_coarsen_map_coarse_i_size =
166 Doftype_coarsen_map_coarse[i].size();
167 for (unsigned j = 0; j < para_doftype_coarsen_map_coarse_i_size; j++)
168 {
169 // Attempt to insert all the values of the inner vector into a set.
170 std::pair<std::set<unsigned>::iterator, bool> doftype_map_ret =
171 doftype_map_set.insert(Doftype_coarsen_map_coarse[i][j]);
172
173 if (!doftype_map_ret.second)
174 {
175 std::ostringstream err_msg;
176 err_msg << "Error: the doftype number "
177 << Doftype_coarsen_map_coarse[i][j]
178 << " is already inserted." << std::endl;
179 throw OomphLibError(
180 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
181 }
182 }
183 }
184
185 // Condition (2.1): Check that the doftype_map_set describes as many
186 // values as doftype_in_master_preconditioner_coarse. I.e. if dof_map
187 // contains 5 dof types, then the doftype_coarsen_map_coarse vector must
188 // also contain 5 dof types.
189 if (para_doftype_in_master_preconditioner_coarse_size !=
190 doftype_map_set.size())
191 {
192 std::ostringstream err_msg;
193 err_msg << "The size of doftype_in_master_preconditioner_coarse "
194 << "must be the same as the total\n"
195 << "number of values in the doftype_coarsen_map_coarse vector."
196 << std::endl;
197 throw OomphLibError(
198 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
199 }
200
201 // Condition (2.2): Check that the maximum element in the
202 // doftype_coarsen_map_coarse vector is the length of the
203 // doftype_in_master_preconditioner_coarse minus 1.
204 unsigned para_doftype_in_master_preconditioner_coarse_size_minus_one =
205 para_doftype_in_master_preconditioner_coarse_size - 1;
206 if (para_doftype_in_master_preconditioner_coarse_size_minus_one !=
207 *doftype_map_set.rbegin())
208 {
209 std::ostringstream err_msg;
210 err_msg << "The maximum dof type number in the "
211 << "doftype_coarsen_map vector must be "
212 << para_doftype_in_master_preconditioner_coarse_size_minus_one
213 << std::endl;
214 throw OomphLibError(
215 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
216 }
217#endif
218
219 // Set the mapping from the master preconditioner DOF types to the
220 // subsidiary preconditioner DOF types.
221 //
222 // IMPORTANT: Since DOF types may be coarsened in the master block
223 // preconditioner, this may no longer reflect the actual underlying dof
224 // types. We must get the actual underlying dof types for the
225 // block_setup(...) function to work properly so all the look up schemes
226 // for this (subsidiary) block preconditioner is correct and works
227 // properly, this is for backwards compatibility purposes and to make sure
 228 // Richard Muddle's code still works at this (subsidiary) level, although it
229 // may not be used.
230 //
231 // If we do not want to make it backwards compatible, we may as well
232 // kill the block_setup(...) for subsidiary block preconditioners -
233 // but other thing may break. Do it at your own risk (take time to
234 // fully understand the whole block preconditioning framework code).
235
236 // Create the corresponding Doftype_in_master_preconditioner_fine and
237 // Doftype_coarsen_map_fine vectors.
238
239 // First resize the vectors.
240 Doftype_in_master_preconditioner_fine.resize(0);
241 Doftype_coarsen_map_fine.resize(0);
242
243 // The Doftype_in_master_preconditioner_fine vector is easy. We know that
244 // the Doftype_coarsen_map_fine in the master preconditioner must be
245 // constructed already. So we simply loop through the values in
246 // doftype_in_master_preconditioner_coarse, then get the most fine grain
247 // dof types from the master preconditioner's Doftype_coarsen_map_fine
248 // vector.
249 //
250 // For example, if the master preconditioner has the vector:
251 // Doftype_coarsen_map_fine = [0,1,2,3][4,5,6,7][8,9,10,11][12,13][14,15]
252 //
253 // and passes the two vectors
254 // doftype_in_master_preconditioner_coarse = [1,2,3]
255 // doftype_coarsen_map_coarse = [[0][1,2]]
256 //
257 // Then we want
258 // Doftype_in_master_preconditioner_fine = [4,5,6,7,8,9,10,11,12,13]
259 //
260 // We achieve this by looking up the corresponding fine dof types in the
261 // masters' Doftype_coarsen_map_fine vector which corresponds to the
262 // values in Doftype_in_master_preconditioner_coarse.
263 //
264 // That is, the values in Doftype_in_master_preconditioner_coarse gives us
265 // the index of sub vector we want in the master's
266 // Doftype_coarsen_map_fine vector.
267
268#ifdef PARANOID
269 // Check that the master block preconditioner's Doftype_coarsen_map_fine
270 // is set up. Under the current implementation, this would always be set
271 // up properly, but we check it just in case!
272 if (master_block_preconditioner_pt()->doftype_coarsen_map_fine().size() ==
273 0)
274 {
275 std::ostringstream err_msg;
276 err_msg << "The master block preconditioner's "
277 << "Doftype_coarsen_map_fine is not\n"
278 << "set up properly.\n"
279 << "\n"
280 << "This vector is constructed in the function "
281 << "block_setup(...).\n"
282 << std::endl;
283 throw OomphLibError(
284 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
285 }
286#endif
287
288 unsigned doftype_in_master_preconditioner_coarse_size =
289 Doftype_in_master_preconditioner_coarse.size();
290 for (unsigned i = 0; i < doftype_in_master_preconditioner_coarse_size;
291 i++)
292 {
293 // The index of the sub vector we want.
294 unsigned subvec_index = Doftype_in_master_preconditioner_coarse[i];
295
296 // Get the corresponding most fine grain sub vector from the master
297 // block preconditioner
298 Vector<unsigned> tmp_master_dof_subvec =
299 Master_block_preconditioner_pt->get_fine_grain_dof_types_in(
300 subvec_index);
301
302 Doftype_in_master_preconditioner_fine.insert(
303 Doftype_in_master_preconditioner_fine.end(),
304 tmp_master_dof_subvec.begin(),
305 tmp_master_dof_subvec.end());
306 }
307
308 // The Doftype_coarsen_map_fine vector is a bit more tricky.
309 // The Doftype_coarsen_map_coarse vector describes which coarse dof types
310 // of THIS preconditioner are grouped together. We have to translate this
311 // into the most fine grain dof types.
312 //
313 // For example, if
314 // Doftype_coarsen_map_coarse = [[0][1,2]]
315 // Doftype_in_master_preconditioner_coarse = [1,2,3]
316 //
317 // and the MASTER preconditioner has:
318 // Doftype_coarsen_map_fine= [[0,1,2,3][4,5,6,7][8,9,10,11][12,13][14,15]]
319 //
 320 // Then [[0][1,2]] tells us that the most fine grain DOF type 1 of the
 321 // master preconditioner must be grouped together, and the most fine
322 // grained dof types 2 and 3 of the master preconditioner must be grouped
323 // together.
324 //
325 // This gives the vector [[4,5,6,7] [8,9,10,11,12,13]], translating this
326 // into the local DOF types of this preconditioner we have
327 // Doftype_coarsen_map_fine = [[0,1,2,3][4,5,6,7,8,9]]. This corresponds
328 // with the Doftype_in_master_preconditioner_fine vector we created above:
329 // Doftype_in_master_preconditioner_fine = [4,5,6,7,8,9,10,11,12,13]
330 //
331 // Together, the master block preconditioner says to THIS subsidiary block
332 // preconditioner "work on my DOF types [4,5,6,7,8,9,10,11,12,13], but
333 // group your DOF type [0,1,2,3] together as DOF type 0 and [4,5,6,7,8,9]
 334 // together as DOF type 1".
335 //
336 // Think of it like this: For each DOF type in Doftype_coarsen_map_coarse
337 // we look at how many values this corresponds to in the master
338 // preconditioner. In this case, Doftype_coarsen_map_coarse:
339 //
340 // 1 - corresponds to fine DOF types 0,1,2,3 in this preconditioner,
341 // and 4,5,6,7 in the master preconditioner;
342 //
343 // 2 - corresponds to fine DOF types 4,5,6,7 in this preconditioner,
344 // and 8,9,10,11 in the master preconditioner;
345 //
346 // 3 - corresponds to fine DOF types 8,9 in this preconditioner,
347 // and 12,13 in the master preconditioner.
348 //
349 // Thus Doftype_coarsen_map_fine = [[0,1,2,3][4,5,6,7,8,9]]
350 //
351 /// /////////////////////////////////////////////////////////////////////
352 //
 353 // How to do this: First we create a 2D vector which corresponds
 354 // to the fine dof types in the master preconditioner but starting from
355 // 0. For example, take the above example (repeated below):
356 // Passed to this prec by the master prec:
357 // Doftype_coarsen_map_coarse = [[0][1,2]]
358 // Doftype_in_master_preconditioner_coarse = [1,2,3]
359 //
360 // and the MASTER preconditioner has:
361 // Doftype_coarsen_map_fine= [[0,1,2,3][4,5,6,7][8,9,10,11][12,13][14,15]]
362 //
363 // Step 1:
364 // Then, the temp 2D vector we want to create is:
365 // master_fine_doftype_translated = [[0 1 2 3], [4,5,6,7], [8,9]]
366 // This comes from using Doftype_in_master_preconditioner_coarse
367 // then get the number of fine dof types in the master.
368 //
369 // Step 2:
370 // Then:
371 // Loop through the vector Doftype_coarsen_map_coarse,
372 // Loop over the inner vectors in Doftype_coarsen_map_coarse
373 // Each element in the inner vector corresponds to a vector in
374 // master_fine_doftype_translated. We push in the vectors of
 375 // master_fine_doftype_translated into Doftype_coarsen_map_fine
376 //
377
378 Vector<Vector<unsigned>> master_fine_doftype_translated;
379 unsigned dof_type_index = 0;
380 for (unsigned i = 0; i < doftype_in_master_preconditioner_coarse_size;
381 i++)
382 {
383 // How many fine DOF types are in the master's
384 // Doftype_in_master_preconditioner_coarse[i]?
385 unsigned coarse_dof = Doftype_in_master_preconditioner_coarse[i];
386
387 unsigned n_master_fine_doftypes =
388 Master_block_preconditioner_pt->nfine_grain_dof_types_in(coarse_dof);
389
390 Vector<unsigned> tmp_sub_vec;
391 for (unsigned j = 0; j < n_master_fine_doftypes; j++)
392 {
393 tmp_sub_vec.push_back(dof_type_index);
394 dof_type_index++;
395 }
396 master_fine_doftype_translated.push_back(tmp_sub_vec);
397 }
398
399
 400 // master_fine_doftype_translated now contains vectors whose values run
 401 // from 0, 1, 2, ...,
402 //
403 // Now read out the values of master_fine_doftype_translated and place
404 // them in order according to Doftype_coarsen_map_coarse.
405 unsigned doftype_coarsen_map_coarse_size =
406 Doftype_coarsen_map_coarse.size();
407 for (unsigned i = 0; i < doftype_coarsen_map_coarse_size; i++)
408 {
409 Vector<unsigned> tmp_vec;
410 unsigned doftype_coarsen_map_coarse_i_size =
411 Doftype_coarsen_map_coarse[i].size();
412 for (unsigned j = 0; j < doftype_coarsen_map_coarse_i_size; j++)
413 {
414 unsigned subvec_i = Doftype_coarsen_map_coarse[i][j];
415
416 tmp_vec.insert(tmp_vec.end(),
417 master_fine_doftype_translated[subvec_i].begin(),
418 master_fine_doftype_translated[subvec_i].end());
419 }
420
421 Doftype_coarsen_map_fine.push_back(tmp_vec);
422 }
423
424 // Get the number of block types (and DOF types) in this preconditioner
425 // from the length of the dof_map vector.
426 Internal_ndof_types = Doftype_in_master_preconditioner_fine.size();
427
428 // Nblock_types is later updated in block_setup(...)
429 Internal_nblock_types = Internal_ndof_types;
430
431 // Compute number of rows in this (sub) preconditioner using data from
432 // the master.
433 Nrow = 0;
434 for (unsigned b = 0; b < Internal_ndof_types; b++)
435 {
436 Nrow += this->internal_dof_block_dimension(b);
437 }
438
439#ifdef PARANOID
440 if (Nrow == 0)
441 {
442 std::ostringstream error_message;
443 error_message
444 << "Nrow=0 in subsidiary preconditioner. This seems fishy and\n"
445 << "suggests that block_setup() was not called for the \n"
446 << "master block preconditioner yet.";
447 throw OomphLibWarning(error_message.str(),
448 OOMPH_CURRENT_FUNCTION,
449 OOMPH_EXCEPTION_LOCATION);
450 }
451#endif
452 }
453
454 // If this is a master block preconditioner, then set the
455 // Doftype_coarsen_map_fine and Doftype_coarsen_map_coarse to the
456 // identity. Recall that the Doftype_coarsen_map_fine maps the dof types
457 // that this preconditioner requires with the most fine grain dof types (the
458 // internal dof types) and the Doftype_coarsen_map_coarse maps the dof
459 // types that this preconditioner requires with the dof types which this
460 // preconditioner is given from a master preconditioner (these dof types may
461 // or may not be coarsened). In the case of the master preconditioner, these
462 // are the same (since dof types are not coarsened), furthermore the
463 // identity mapping is provided to say that dof type 0 maps to dof type 0,
464 // dof type 1 maps to dof type 1,
465 // dof type 2 maps to dof type 2,
466 // etc...
467 //
468 // If this is not a master block preconditioner, then the vectors
469 // Doftype_coarsen_map_fine and Doftype_coarsen_map_coarse is handled
470 // by the turn_into_subsidiary_block_preconditioner(...) function.
471 if (is_master_block_preconditioner())
472 {
473 // How many dof types does this preconditioner work with?
474 unsigned n_external_dof_types = dof_to_block_map.size();
475
476 // Note: at the master level, the n_external_dof_types should be the same
477 // as the internal_ndof_types(), since the dof_to_block_map MUST describe
478 // the mapping between every dof type (not yet coarsened - so it is the
479 // same number as the internal dof types) to the block types. But we
480 // distinguish them for clarity. We also check that this is the case.
481#ifdef PARANOID
482 unsigned n_internal_dof_types = internal_ndof_types();
483
484 if (n_internal_dof_types != n_external_dof_types)
485 {
486 std::ostringstream err_msg;
487 err_msg
488 << "The internal ndof types and the length of the dof_to_block_map\n"
489 << "vector is not the same. Since this is the master block "
490 << "preconditioner,\n"
491 << "you must describe which block each DOF type belongs to,\n"
492 << "no more, no less."
493 << "internal_ndof_types = " << n_internal_dof_types << "\n"
494 << "dof_to_block_map.size() = " << n_external_dof_types << "\n";
495 throw OomphLibWarning(
496 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
497 }
498#endif
499
500 // Clear and reserve space.
501 Doftype_coarsen_map_fine.clear();
502 Doftype_coarsen_map_coarse.clear();
503 Doftype_coarsen_map_fine.reserve(n_external_dof_types);
504 Doftype_coarsen_map_coarse.reserve(n_external_dof_types);
505
506 // Now push back the identity mapping.
507 for (unsigned i = 0; i < n_external_dof_types; i++)
508 {
509 // Create a vector and push it in.
510 Vector<unsigned> tmp_vec(1, i);
511 Doftype_coarsen_map_fine.push_back(tmp_vec);
512 Doftype_coarsen_map_coarse.push_back(tmp_vec);
513 }
514 }
515 else
516 // Else this is a subsidiary block preconditioner.
517 {
518 // Both the Doftype_coarsen_map_fine and Doftype_coarsen_map_coarse
519 // vectors must be already be handled by the
520 // turn_into_subsidiary_block_preconditioner(...) function. We check this.
521#ifdef PARANOID
522 if ((Doftype_coarsen_map_fine.size() == 0) ||
523 (Doftype_coarsen_map_coarse.size() == 0))
524 {
525 std::ostringstream err_msg;
526 err_msg << "Either the Doftype_coarsen_map_fine or the \n"
527 << "Doftype_coarsen_map_coarse vectors is of size 0.\n"
528 << "Did you remember to call the function "
529 << "turn_into_subsidiary_block_preconditioner(...)?";
530 throw OomphLibWarning(
531 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
532 }
533#endif
534 }
535
536
537 // Now we create the vector Block_to_dof_map_coarse.
538 // Recall that the vector describe which dof types are in which block with
539 // the relationship:
540 //
541 // Block_to_dof_map_coarse[block_number] = Vector[dof types];
542 //
543 // Note that this is not the internal (underlying) dof type.
544 // Nor is this in relation to the parent block preconditioner's dof type.
545 // The number of elements in it is the same as dof_to_block_map vector.
546 //
547 // Since the dof type coarsening feature is added later, we encapsulate this
548 // bit of the code so it does not affect things below.
549 {
550 // Check that the dof_to_block map "makes sense" for the
551 // Doftype_coarsen_map_coarse.
552 // The Doftype_coarsen_map_coarse describes which doftypes should be
553 // considered as a single doftype in THIS preconditioner.
554 //
555 // For example, if this preconditioner is the LSC block preconditioner
556 // applied to a 3D problem, it expects 4 doftypes:
557 // 3 velocity, [u, v, w] and 1 pressure [p],
558 // giving us the dof type ordering
559 // [u v w p].
560 //
561 // The LSC preconditioner groups the velocity and pressure doftypes
562 // separately, thus the dof_to_block_map will be:
563 // [0 0 0 1]
564 //
565 // Then the Doftype_coarsen_map_coarse MUST have length 4, to identify
566 // which of the OTHER (possibly coarsened) dof types should be grouped
567 // together to be considered as a single dof types of THIS preconditioner.
568 //
569 // For example, if the preconditioner above this one has the dof type
570 // ordering:
571 // 0 1 2 3 4 5 6 7 8 9
572 // ub vb wb up vp wp ut vt wt p
573 // Then we want to tell THIS preconditioner which dof types belongs to
574 // u, v, w and p, by providing the optional argument
575 // Doftype_coarsen_map_coarse to the
576 // turn_into_subsidiary_block_preconditioner(...) function
577 // [[0 3 6] [1 4 7] [2 5 8] [9]]
578 //
579 // So, it is important that the length of dof_to_block_map is the same as
580 // the length of Doftype_coarsen_map_coarse. We check this.
581 unsigned dof_to_block_map_size = dof_to_block_map.size();
582
583#ifdef PARANOID
584 if (dof_to_block_map_size != Doftype_coarsen_map_coarse.size())
585 {
586 std::ostringstream err_msg;
587 err_msg
588 << "The size of dof_to_block_map and Doftype_coarsen_map_coarse is "
589 "not "
590 << "the same.\n"
591 << "dof_to_block_map.size() = " << dof_to_block_map_size << "\n"
592 << "Doftype_coarsen_map_coarse.size() = "
593 << Doftype_coarsen_map_coarse.size() << ".\n"
594 << "One of the two list is incorrect, please look at the comments\n"
595 << "in the source code for more details.";
596 throw OomphLibWarning(
597 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
598 }
599#endif
600
601 // Create the Block_to_dof_map_coarse from
602 // the dof_to_block_map and Doftype_coarsen_map_coarse.
603
604 // find the maximum block number
605 unsigned max_block_number =
606 *std::max_element(dof_to_block_map.begin(), dof_to_block_map.end());
607
608 // Now we do the following:
609 // Lets say the Doftype_coarsen_map_coarse is:
610 // [0 3 6]
611 // [1 4 7]
612 // [2 5 8]
613 // [9]
614 //
615 // (this is the same as the above example)
616 //
617 // and the dof_to_block_map is [0 0 0 1].
618 //
619 // Then we need to form the Block_to_dof_map_coarse:
620 // [0 3 6 1 4 7 2 5 8]
621 // [9]
622
623 // Clear anything in the Block_to_dof_map_coarse
624 Block_to_dof_map_coarse.clear();
625
626 const unsigned tmp_nblock = max_block_number + 1;
627
628 Block_to_dof_map_coarse.resize(tmp_nblock);
629
630 for (unsigned i = 0; i < dof_to_block_map_size; i++)
631 {
632 Block_to_dof_map_coarse[dof_to_block_map[i]].push_back(i);
633 }
635 Block_to_dof_map_fine.clear();
636 Block_to_dof_map_fine.resize(tmp_nblock);
637 for (unsigned block_i = 0; block_i < tmp_nblock; block_i++)
638 {
639 // get the dof types in this block.
640 const unsigned ndof_in_block = Block_to_dof_map_coarse[block_i].size();
641 for (unsigned dof_i = 0; dof_i < ndof_in_block; dof_i++)
642 {
643 const unsigned coarsened_dof_i =
644 Block_to_dof_map_coarse[block_i][dof_i];
645
646 // Insert the most fine grain dofs which this dof_i corresponds to
647 // into block_i
648 Vector<unsigned> dof_i_dofs =
649 Doftype_coarsen_map_fine[coarsened_dof_i];
650
651 Block_to_dof_map_fine[block_i].insert(
652 Block_to_dof_map_fine[block_i].end(),
653 dof_i_dofs.begin(),
654 dof_i_dofs.end());
655 }
656 }
657
 658 // Now set the dof_to_block_map to the identity.
659 // NOTE: We are now using the internal n dof types. This is because the
660 // dof type coarsening feature was built on top of the existing block
661 // preconditioning framework which does not handle coarsening of dof
662 // types. Hence, under the hood, it still works with the most fine grain
663 // dof types and does not do any coarsening.
664
665 // Locally cache the internal ndof types (using access function because
666 // the Internal_ndof_type variable may not be set up yet if this is a
667 // master preconditioner).
668 unsigned tmp_internal_ndof_types = internal_ndof_types();
669
670 dof_to_block_map.resize(tmp_internal_ndof_types, 0);
671
672 for (unsigned i = 0; i < tmp_internal_ndof_types; i++)
673 {
674 dof_to_block_map[i] = i;
675 }
676 } // end of Block_to_dof_map_coarse encapsulation
677
678#ifdef PARANOID
679
680 // Check that the meshes are ok. This only needs to be done in the master
681 // because subsidiary preconditioners don't do anything with the meshes
682 // here.
683 if (is_master_block_preconditioner())
684 {
685 // This is declared as local_nmesh because there are other variables
686 // called nmesh floating about... but this will not exist if PARANOID is
687 // switched on.
688 unsigned local_nmesh = nmesh();
689
690 // Check that some mesh pointers have been assigned.
691 if (local_nmesh == 0)
692 {
693 std::ostringstream error_msg;
694 error_msg << "Cannot setup blocks because no meshes have been set.";
695 throw OomphLibError(
696 error_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
697 }
698
699 // Each mesh must contain elements with the same number of dof.
700 // A stricter check is to ensure that the mesh contains only one type of
 701 // elements. This is relaxed in some cases.
702 for (unsigned mesh_i = 0; mesh_i < local_nmesh; mesh_i++)
703 {
704 // The number of elements in the current mesh.
705 unsigned n_element = mesh_pt(mesh_i)->nelement();
706
707 // When the bulk mesh is distributed, there may not be any elements
708 // in the surface mesh(es).
709 if (n_element > 0)
710 {
711 // The string of the first element in the current mesh.
712 std::string first_element_string =
713 typeid(*(mesh_pt(mesh_i)->element_pt(0))).name();
714
715 // If there are multiple element types in the current mesh,
716 // we can at least make sure that they contain the same types of DOFs.
717 if (bool(Allow_multiple_element_type_in_mesh[mesh_i]))
718 {
719 // The ndof types of the first element.
720 unsigned first_element_ndof_type =
721 mesh_pt(mesh_i)->element_pt(0)->ndof_types();
722
723 // Loop through the meshes and compare the number of types of DOFs.
724 for (unsigned el_i = 1; el_i < n_element; el_i++)
725 {
726 // The ndof type of the current element.
727 unsigned current_element_ndof_type =
728 mesh_pt(mesh_i)->element_pt(el_i)->ndof_types();
729
730 // The string of the current element.
731 std::string current_element_string =
732 typeid(*(mesh_pt(mesh_i)->element_pt(el_i))).name();
733
734 // Compare against the first element.
735 if (current_element_ndof_type != first_element_ndof_type)
736 {
737 std::ostringstream error_message;
738 error_message
739 << "Elements in the same mesh MUST have the same number of "
740 "types "
741 << "of DOFs.\n"
742 << "The element in mesh " << mesh_i << ", at position "
743 << el_i << " is: \n"
744 << current_element_string << ", \n"
745 << "with ndof types: " << current_element_ndof_type << ".\n"
746 << "The first element in the same mesh is: \n"
747 << first_element_string << ", \n"
748 << "with ndof types: " << first_element_ndof_type << ".\n";
749 throw OomphLibError(error_message.str(),
750 OOMPH_CURRENT_FUNCTION,
751 OOMPH_EXCEPTION_LOCATION);
752 }
753 }
754 }
755 else
756 // There should be only one type of elements in the current mesh.
757 // Check that this is the case!
758 {
759 // Loop through the elements in the current mesh.
760 for (unsigned el_i = 1; el_i < n_element; el_i++)
761 {
762 // The string of the current element.
763 std::string current_element_string =
764 typeid(*(mesh_pt(mesh_i)->element_pt(el_i))).name();
765
766 // Compare against the first element.
767 if (current_element_string.compare(first_element_string) != 0)
768 {
769 std::ostringstream error_message;
770 error_message
771 << "By default, a mesh containing block preconditionable "
772 << "elements must contain only one type of element.\n"
773 << "The element in mesh " << mesh_i << ", at position "
774 << el_i << " is: \n"
775 << current_element_string << "\n"
776 << "The first element in the same mesh is: \n"
777 << first_element_string << "\n"
778 << "If this is correct, consider calling the set_mesh(...) "
779 "with\n"
780 << "the optional argument set true to allow multiple "
781 "element\n"
782 << "types in the same mesh.\n"
783 << "Note: A minimal requirement is that the elements in the "
784 "same\n"
785 << "mesh MUST have the same number of DOF types.";
786 throw OomphLibError(error_message.str(),
787 OOMPH_CURRENT_FUNCTION,
788 OOMPH_EXCEPTION_LOCATION);
789 }
790 }
791 }
792 }
793 }
794 }
795
796#endif
797 // clear the memory
798 this->clear_block_preconditioner_base();
799
800 // get my_rank and nproc
801#ifdef OOMPH_HAS_MPI
802 unsigned my_rank = comm_pt()->my_rank();
803 unsigned nproc = comm_pt()->nproc();
804#endif
805
806
807 /// //////////////////////////////////////////////////////////////////////////
808 // start of master block preconditioner only operations
809 /// //////////////////////////////////////////////////////////////////////////
810#ifdef OOMPH_HAS_MPI
811 unsigned* nreq_sparse = new unsigned[nproc]();
812 unsigned* nreq_sparse_for_proc = new unsigned[nproc]();
813 unsigned** index_in_dof_block_sparse_send = new unsigned*[nproc]();
814 unsigned** dof_number_sparse_send = new unsigned*[nproc]();
815 Vector<MPI_Request> send_requests_sparse;
816 Vector<MPI_Request> recv_requests_sparse;
817#endif
818
819 // If this preconditioner is the master preconditioner then we need
820 // to assemble the vectors : Dof_number
821 // Index_in_dof_block
822 if (is_master_block_preconditioner())
823 {
824 // Get the number of dof types in each mesh.
825 Ndof_types_in_mesh.assign(nmesh(), 0);
826 for (unsigned i = 0; i < nmesh(); i++)
827 {
828 Ndof_types_in_mesh[i] = mesh_pt(i)->ndof_types();
829 }
830 // Setup the distribution of this preconditioner, assumed to be the same
831 // as the matrix if the matrix is distributable.
832 if (dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt()))
833 {
834 this->build_distribution(
835 dynamic_cast<DistributableLinearAlgebraObject*>(matrix_pt())
836 ->distribution_pt());
837 }
838 else
839 {
840 LinearAlgebraDistribution dist(comm_pt(), matrix_pt()->nrow(), false);
841 this->build_distribution(dist);
842 }
843 Nrow = matrix_pt()->nrow();
844
845 // Boolean to indicate whether the matrix is actually distributed,
846 // ie distributed and on more than one processor.
847 bool matrix_distributed =
848 (this->distribution_pt()->distributed() &&
849 this->distribution_pt()->communicator_pt()->nproc() > 1);
850
851
852 // Matrix must be a CR matrix.
853 CRDoubleMatrix* cr_matrix_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt());
854
855 if (cr_matrix_pt == 0)
856 {
857 std::ostringstream error_message;
858 error_message << "Block setup for distributed matrices only works "
859 << "for CRDoubleMatrices";
860 throw OomphLibError(error_message.str(),
861 OOMPH_CURRENT_FUNCTION,
862 OOMPH_EXCEPTION_LOCATION);
863 }
864
865
866 // Get distribution.
867 unsigned first_row = this->distribution_pt()->first_row();
868 unsigned nrow_local = this->distribution_pt()->nrow_local();
869 unsigned last_row = first_row + nrow_local - 1;
870
871#ifdef OOMPH_HAS_MPI
872 // storage for the rows required by each processor in the dense
873 // block lookup storage scheme
874 // dense_required_rows(p,0) is the minimum global index required by proc p
875 // ...(p,1) is the maximum global index required by proc p
876 DenseMatrix<unsigned> dense_required_rows(nproc, 2);
877 for (unsigned p = 0; p < nproc; p++)
878 {
879 dense_required_rows(p, 0) = this->distribution_pt()->first_row(p);
880 dense_required_rows(p, 1) = this->distribution_pt()->first_row(p) +
881 this->distribution_pt()->nrow_local(p) - 1;
882 }
883
884 // determine the global rows That are not in the range first_row to
885 // first_row+nrow_local for which we should store the
886 // Dof_index and Index_in_dof_block for
887 // then send the lists to other processors
888 std::set<unsigned> sparse_global_rows_for_block_lookup;
889 if (matrix_distributed)
890 {
891 unsigned nnz = cr_matrix_pt->nnz();
892 int* column_index = cr_matrix_pt->column_index();
893 for (unsigned i = 0; i < nnz; i++)
894 {
895 unsigned ci = column_index[i];
896 if (ci < first_row || ci > last_row)
897 {
898 sparse_global_rows_for_block_lookup.insert(ci);
899 }
900 }
901 }
902
903 int nsparse = sparse_global_rows_for_block_lookup.size();
904
905 Global_index_sparse.resize(0);
906 std::copy(sparse_global_rows_for_block_lookup.begin(),
907 sparse_global_rows_for_block_lookup.end(),
908 std::back_inserter(Global_index_sparse));
909
910 Index_in_dof_block_sparse.resize(nsparse);
911 Dof_number_sparse.resize(nsparse);
912 sparse_global_rows_for_block_lookup.clear();
913
914 Vector<MPI_Request> recv_requests_sparse_nreq;
915 if (matrix_distributed)
916 {
917 MPI_Aint base_displacement_sparse;
918 MPI_Get_address(nreq_sparse, &base_displacement_sparse);
919
920 int zero = 0;
921 for (unsigned p = 0; p < nproc; p++)
922 {
923 // determine the global eqn numbers required by this processor
924 // that can be classified by processor p
925 int begin = 0;
926 for (int i = 0; i < nsparse; ++i)
927 {
928 if (Global_index_sparse[i] < dense_required_rows(p, 0))
929 {
930 ++begin;
931 }
932 else
933 {
934 if (Global_index_sparse[i] <= dense_required_rows(p, 1))
935 {
936 ++nreq_sparse[p];
937 }
938 else
939 {
940 break;
941 }
942 }
943 }
944
945 // if this processor has rows to be classified by proc p
946 if (nreq_sparse[p] > 0)
947 {
948 // send the number of global eqn numbers
949 MPI_Request req1;
950 MPI_Isend(&nreq_sparse[p],
951 1,
952 MPI_UNSIGNED,
953 p,
954 31,
955 comm_pt()->mpi_comm(),
956 &req1);
957 send_requests_sparse.push_back(req1);
958
959 // send the global eqn numbers
960 MPI_Request req2;
961 MPI_Isend(&Global_index_sparse[begin],
962 nreq_sparse[p],
963 MPI_UNSIGNED,
964 p,
965 32,
966 comm_pt()->mpi_comm(),
967 &req2);
968 send_requests_sparse.push_back(req2);
969
970 // post the recvs for the data that will be returned
971
972 // the datatypes, displacements, lengths for the two datatypes
973 MPI_Datatype types[2];
974 MPI_Aint displacements[2];
975 int lengths[2];
976
977 // index in dof block
978 MPI_Type_contiguous(nreq_sparse[p], MPI_UNSIGNED, &types[0]);
979 MPI_Type_commit(&types[0]);
980 MPI_Get_address(&Index_in_dof_block_sparse[begin],
981 &displacements[0]);
982 displacements[0] -= base_displacement_sparse;
983 lengths[0] = 1;
984
985 // dof number
986 MPI_Type_contiguous(nreq_sparse[p], MPI_UNSIGNED, &types[1]);
987 MPI_Type_commit(&types[1]);
988 MPI_Get_address(&Dof_number_sparse[begin], &displacements[1]);
989 displacements[1] -= base_displacement_sparse;
990 lengths[1] = 1;
991
992 // build the final type
993 MPI_Datatype recv_type;
994 MPI_Type_create_struct(
995 2, lengths, displacements, types, &recv_type);
996 MPI_Type_commit(&recv_type);
997 MPI_Type_free(&types[0]);
998 MPI_Type_free(&types[1]);
999
1000 // and recv
1001 MPI_Request req;
1002 MPI_Irecv(
1003 nreq_sparse, 1, recv_type, p, 33, comm_pt()->mpi_comm(), &req);
1004 recv_requests_sparse.push_back(req);
1005 MPI_Type_free(&recv_type);
1006 }
1007
1008 // if no communication required, confirm this
1009 if (nreq_sparse[p] == 0)
1010 {
1011 MPI_Request req1;
1012 MPI_Isend(
1013 &zero, 1, MPI_UNSIGNED, p, 31, comm_pt()->mpi_comm(), &req1);
1014 send_requests_sparse.push_back(req1);
1015 }
1016
1017 //
1018 MPI_Request req;
1019 MPI_Irecv(&nreq_sparse_for_proc[p],
1020 1,
1021 MPI_UNSIGNED,
1022 p,
1023 31,
1024 comm_pt()->mpi_comm(),
1025 &req);
1026 recv_requests_sparse_nreq.push_back(req);
1027 }
1028 }
1029#endif
1030
1031 // resize the storage
1032 Dof_number_dense.resize(nrow_local);
1033 Index_in_dof_block_dense.resize(nrow_local);
1034
1035 // zero the number of dof types
1036 Internal_ndof_types = 0;
1037
1038#ifdef PARANOID
1039 // Vector to keep track of previously assigned block numbers
1040 // to check consistency between multiple assignments.
1041 Vector<int> previously_assigned_block_number(nrow_local,
1043#endif
1044
1045 // determine whether the problem is distribution
1046 bool problem_distributed = false;
1047
1048 // the problem method distributed() is only accessible with MPI
1049#ifdef OOMPH_HAS_MPI
1050 problem_distributed = any_mesh_distributed();
1051#endif
1052
1053 // if the problem is not distributed
1054 if (!problem_distributed)
1055 {
1056 // Offset for the block type in the overall system.
1057 // Different meshes contain different block-preconditionable
1058 // elements -- their blocks are added one after the other.
1059 unsigned dof_offset = 0;
1060
1061 // Loop over all meshes.
1062 for (unsigned m = 0; m < nmesh(); m++)
1063 {
1064 // Number of elements in this mesh.
1065 unsigned n_element = mesh_pt(m)->nelement();
1066
1067 // Find the number of block types that the elements in this mesh
1068 // are in charge of.
1069 unsigned ndof_in_element = ndof_types_in_mesh(m);
1070 Internal_ndof_types += ndof_in_element;
1071
1072 for (unsigned e = 0; e < n_element; e++)
1073 {
1074 // List containing pairs of global equation number and
1075 // dof number for each global dof in an element.
1076 std::list<std::pair<unsigned long, unsigned>> dof_lookup_list;
1077
1078 // Get list of blocks associated with the element's global unknowns.
1079 mesh_pt(m)->element_pt(e)->get_dof_numbers_for_unknowns(
1080 dof_lookup_list);
1081
1082 // Loop over all entries in the list
1083 // and store the block number.
1084 typedef std::list<std::pair<unsigned long, unsigned>>::iterator IT;
1085 for (IT it = dof_lookup_list.begin(); it != dof_lookup_list.end();
1086 it++)
1087 {
1088 unsigned long global_dof = it->first;
1089 if (global_dof >= unsigned(first_row) &&
1090 global_dof <= unsigned(last_row))
1091 {
1092 unsigned dof_number = (it->second) + dof_offset;
1093 Dof_number_dense[global_dof - first_row] = dof_number;
1094
1095#ifdef PARANOID
1096 // Check consistency of block numbers if assigned multiple times
1097 if (previously_assigned_block_number[global_dof - first_row] <
1098 0)
1099 {
1100 previously_assigned_block_number[global_dof - first_row] =
1101 dof_number;
1102 }
1103#endif
1104 }
1105 }
1106 }
1107
1108 // About to do the next mesh which contains block preconditionable
1109 // elements of a different type; all the dofs that these elements are
1110 // "in charge of" differ from the ones considered so far.
1111 // Bump up the block counter to make sure we're not overwriting
1112 // anything here
1113 dof_offset += ndof_in_element;
1114 }
1115
1116#ifdef PARANOID
1117 // check that every global equation number has been allocated a dof type
1118 for (unsigned i = 0; i < nrow_local; i++)
1119 {
1120 if (previously_assigned_block_number[i] < 0)
1121 {
1122 std::ostringstream error_message;
1123 error_message << "Not all degrees of freedom have had DOF type "
1124 << "numbers allocated. Dof number " << i
1125 << " is unallocated.";
1126 throw OomphLibError(error_message.str(),
1127 OOMPH_CURRENT_FUNCTION,
1128 OOMPH_EXCEPTION_LOCATION);
1129 }
1130 }
1131#endif
1132 }
1133 // else the problem is distributed
1134 else
1135 {
1136#ifdef OOMPH_HAS_MPI
1137 // Offset for the block type in the overall system.
1138 // Different meshes contain different block-preconditionable
1139 // elements -- their blocks are added one after the other...
1140 unsigned dof_offset = 0;
1141
1142 // the set of global degrees of freedom and their corresponding dof
1143 // number on this processor
1144 std::map<unsigned long, unsigned> my_dof_map;
1145
1146 // Loop over all meshes
1147 for (unsigned m = 0; m < nmesh(); m++)
1148 {
1149 // Number of elements in this mesh
1150 unsigned n_element = this->mesh_pt(m)->nelement();
1151
1152 // Find the number of block types that the elements in this mesh
1153 // are in charge of
1154 unsigned ndof_in_element = ndof_types_in_mesh(m);
1155 Internal_ndof_types += ndof_in_element;
1156
1157 // Loop over all elements
1158 for (unsigned e = 0; e < n_element; e++)
1159 {
1160 // if the element is not a halo element
1161 if (!this->mesh_pt(m)->element_pt(e)->is_halo())
1162 {
1163 // List containing pairs of global equation number and
1164 // dof number for each global dof in an element
1165 std::list<std::pair<unsigned long, unsigned>> dof_lookup_list;
1166
1167 // Get list of blocks associated with the element's global
1168 // unknowns
1169 this->mesh_pt(m)->element_pt(e)->get_dof_numbers_for_unknowns(
1170 dof_lookup_list);
1171
1172 // update the block numbers and put it in the map.
1173 typedef std::list<std::pair<unsigned long, unsigned>>::iterator
1174 IT;
1175 for (IT it = dof_lookup_list.begin(); it != dof_lookup_list.end();
1176 it++)
1177 {
1178 it->second = (it->second) + dof_offset;
1179 my_dof_map[it->first] = it->second;
1180 }
1181 }
1182 }
1183
1184 // About to do the next mesh which contains block preconditionable
1185 // elements of a different type; all the dofs that these elements are
1186 // "in charge of" differ from the ones considered so far.
1187 // Bump up the block counter to make sure we're not overwriting
1188 // anything here
1189 dof_offset += ndof_in_element;
1190 }
1191
1192 // next copy the map of my dofs to two vectors to send
1193 unsigned my_ndof = my_dof_map.size();
1194 unsigned long* my_global_dofs = new unsigned long[my_ndof];
1195 unsigned* my_dof_numbers = new unsigned[my_ndof];
1196 typedef std::map<unsigned long, unsigned>::iterator IT;
1197 unsigned pt = 0;
1198 for (IT it = my_dof_map.begin(); it != my_dof_map.end(); it++)
1199 {
1200 my_global_dofs[pt] = it->first;
1201 my_dof_numbers[pt] = it->second;
1202 pt++;
1203 }
1204
1205 // and then clear the map
1206 my_dof_map.clear();
1207
1208 // count up how many DOFs need to be sent to each processor
1209 int* first_dof_to_send = new int[nproc];
1210 int* ndof_to_send = new int[nproc];
1211 unsigned ptr = 0;
1212 for (unsigned p = 0; p < nproc; p++)
1213 {
1214 first_dof_to_send[p] = 0;
1215 ndof_to_send[p] = 0;
1216 while (ptr < my_ndof &&
1217 my_global_dofs[ptr] < dense_required_rows(p, 0))
1218 {
1219 ptr++;
1220 }
1221 first_dof_to_send[p] = ptr;
1222 while (ptr < my_ndof &&
1223 my_global_dofs[ptr] <= dense_required_rows(p, 1))
1224 {
1225 ndof_to_send[p]++;
1226 ptr++;
1227 }
1228 }
1229
1230 // next communicate to each processor how many dofs it expects to recv
1231 int* ndof_to_recv = new int[nproc];
1232 MPI_Alltoall(ndof_to_send,
1233 1,
1234 MPI_INT,
1235 ndof_to_recv,
1236 1,
1237 MPI_INT,
1238 comm_pt()->mpi_comm());
1239
1240 // the base displacements for the sends
1241 MPI_Aint base_displacement;
1242 MPI_Get_address(my_global_dofs, &base_displacement);
1243
1244#ifdef PARANOID
1245 // storage for paranoid check to ensure that every row as been
1246 // imported
1247 std::vector<bool> dof_recv(nrow_local, false);
1248#endif
1249
1250 // next send and recv
1251 Vector<MPI_Request> send_requests;
1252 Vector<MPI_Request> recv_requests;
1253 Vector<unsigned long*> global_dofs_recv(nproc, 0);
1254 Vector<unsigned*> dof_numbers_recv(nproc, 0);
1255 Vector<unsigned> proc;
1256 for (unsigned p = 0; p < nproc; p++)
1257 {
1258 if (p != my_rank)
1259 {
1260 // send
1261 if (ndof_to_send[p] > 0)
1262 {
1263 // the datatypes, displacements, lengths for the two datatypes
1264 MPI_Datatype types[2];
1265 MPI_Aint displacements[2];
1266 int lengths[2];
1267
1268 // my global dofs
1269 MPI_Type_contiguous(
1270 ndof_to_send[p], MPI_UNSIGNED_LONG, &types[0]);
1271 MPI_Type_commit(&types[0]);
1272 MPI_Get_address(my_global_dofs + first_dof_to_send[p],
1273 &displacements[0]);
1274 displacements[0] -= base_displacement;
1275 lengths[0] = 1;
1276
1277 // my dof numbers
1278 MPI_Type_contiguous(ndof_to_send[p], MPI_UNSIGNED, &types[1]);
1279 MPI_Type_commit(&types[1]);
1280 MPI_Get_address(my_dof_numbers + first_dof_to_send[p],
1281 &displacements[1]);
1282 displacements[1] -= base_displacement;
1283 lengths[1] = 1;
1284
1285 // build the final type
1286 MPI_Datatype send_type;
1287 MPI_Type_create_struct(
1288 2, lengths, displacements, types, &send_type);
1289 MPI_Type_commit(&send_type);
1290 MPI_Type_free(&types[0]);
1291 MPI_Type_free(&types[1]);
1292
1293 // and send
1294 MPI_Request req;
1295 MPI_Isend(my_global_dofs,
1296 1,
1297 send_type,
1298 p,
1299 2,
1300 comm_pt()->mpi_comm(),
1301 &req);
1302 send_requests.push_back(req);
1303 MPI_Type_free(&send_type);
1304 }
1305
1306 // and recv
1307 if (ndof_to_recv[p] > 0)
1308 {
1309 // resize the storage
1310 global_dofs_recv[p] = new unsigned long[ndof_to_recv[p]];
1311 dof_numbers_recv[p] = new unsigned[ndof_to_recv[p]];
1312 proc.push_back(p);
1313
1314 // the datatypes, displacements, lengths for the two datatypes
1315 MPI_Datatype types[2];
1316 MPI_Aint displacements[2];
1317 int lengths[2];
1318
1319 // my global dofs
1320 MPI_Type_contiguous(
1321 ndof_to_recv[p], MPI_UNSIGNED_LONG, &types[0]);
1322 MPI_Type_commit(&types[0]);
1323 MPI_Get_address(global_dofs_recv[p], &displacements[0]);
1324 displacements[0] -= base_displacement;
1325 lengths[0] = 1;
1326
1327 // my dof numbers
1328 MPI_Type_contiguous(ndof_to_recv[p], MPI_UNSIGNED, &types[1]);
1329 MPI_Type_commit(&types[1]);
1330 MPI_Get_address(dof_numbers_recv[p], &displacements[1]);
1331 displacements[1] -= base_displacement;
1332 lengths[1] = 1;
1333
1334 // build the final type
1335 MPI_Datatype recv_type;
1336 MPI_Type_create_struct(
1337 2, lengths, displacements, types, &recv_type);
1338 MPI_Type_commit(&recv_type);
1339 MPI_Type_free(&types[0]);
1340 MPI_Type_free(&types[1]);
1341
1342 // and recv
1343 MPI_Request req;
1344 MPI_Irecv(my_global_dofs,
1345 1,
1346 recv_type,
1347 p,
1348 2,
1349 comm_pt()->mpi_comm(),
1350 &req);
1351 recv_requests.push_back(req);
1352 MPI_Type_free(&recv_type);
1353 }
1354 }
1355 // send to self
1356 else
1357 {
1358 unsigned u = first_dof_to_send[p] + ndof_to_recv[p];
1359 for (unsigned i = first_dof_to_send[p]; i < u; i++)
1360 {
1361#ifdef PARANOID
1362 // indicate that this dof has ben recv
1363 dof_recv[my_global_dofs[i] - first_row] = true;
1364#endif
1365 Dof_number_dense[my_global_dofs[i] - first_row] =
1366 my_dof_numbers[i];
1367 }
1368 }
1369 }
1370
1371 // recv and store the data
1372 unsigned c_recv = recv_requests.size();
1373 while (c_recv > 0)
1374 {
1375 // wait for any communication to finish
1376 int req_number;
1377 MPI_Waitany(
1378 c_recv, &recv_requests[0], &req_number, MPI_STATUS_IGNORE);
1379 recv_requests.erase(recv_requests.begin() + req_number);
1380 c_recv--;
1381
1382 // determine the source processor
1383 unsigned p = proc[req_number];
1384 proc.erase(proc.begin() + req_number);
1385
1386 // import the data
1387 for (int i = 0; i < ndof_to_recv[p]; i++)
1388 {
1389#ifdef PARANOID
1390 // indicate that this dof has ben recv
1391 dof_recv[global_dofs_recv[p][i] - first_row] = true;
1392#endif
1393 Dof_number_dense[global_dofs_recv[p][i] - first_row] =
1394 dof_numbers_recv[p][i];
1395 }
1396
1397 // delete the data
1398 delete[] global_dofs_recv[p];
1399 delete[] dof_numbers_recv[p];
1400 }
1401
1402 // finally wait for the send requests to complete as we are leaving
1403 // an MPI block of code
1404 unsigned csr = send_requests.size();
1405 if (csr)
1406 {
1407 MPI_Waitall(csr, &send_requests[0], MPI_STATUS_IGNORE);
1408 }
1409
1410 // clean up
1411 delete[] ndof_to_send;
1412 delete[] first_dof_to_send;
1413 delete[] ndof_to_recv;
1414 delete[] my_global_dofs;
1415 delete[] my_dof_numbers;
1416#ifdef PARANOID
1417 unsigned all_recv = true;
1418 for (unsigned i = 0; i < nrow_local; i++)
1419 {
1420 if (!dof_recv[i])
1421 {
1422 all_recv = false;
1423 }
1424 }
1425 if (!all_recv)
1426 {
1427 std::ostringstream error_message;
1428 error_message << "Not all the DOF numbers required were received";
1429 throw OomphLibError(error_message.str(),
1430 OOMPH_CURRENT_FUNCTION,
1431 OOMPH_EXCEPTION_LOCATION);
1432 }
1433#endif
1434#else
1435 std::ostringstream error_message;
1436 error_message
1437 << "The problem appears to be distributed, MPI is required";
1438 throw OomphLibError(error_message.str(),
1439 OOMPH_CURRENT_FUNCTION,
1440 OOMPH_EXCEPTION_LOCATION);
1441#endif
1442 }
1443#ifdef OOMPH_HAS_MPI
1444 Vector<unsigned*> sparse_rows_for_proc(nproc, 0);
1445 Vector<MPI_Request> sparse_rows_for_proc_requests;
1446 if (matrix_distributed)
1447 {
1448 // wait for number of sparse rows each processor requires
1449 // post recvs for that data
1450 if (recv_requests_sparse_nreq.size() > 0)
1451 {
1452 MPI_Waitall(recv_requests_sparse_nreq.size(),
1453 &recv_requests_sparse_nreq[0],
1454 MPI_STATUS_IGNORE);
1455 }
1456 for (unsigned p = 0; p < nproc; ++p)
1457 {
1458 if (nreq_sparse_for_proc[p] > 0)
1459 {
1460 MPI_Request req;
1461 sparse_rows_for_proc[p] = new unsigned[nreq_sparse_for_proc[p]];
1462 MPI_Irecv(sparse_rows_for_proc[p],
1463 nreq_sparse_for_proc[p],
1464 MPI_UNSIGNED,
1465 p,
1466 32,
1467 comm_pt()->mpi_comm(),
1468 &req);
1469 sparse_rows_for_proc_requests.push_back(req);
1470 }
1471 }
1472 }
1473#endif
1474
1475
1476 // for every global degree of freedom required by this processor we now
1477 // have the corresponding dof number
1478
1479 // clear the Ndof_in_dof_block storage
1480 Dof_dimension.assign(Internal_ndof_types, 0);
1481
1482 // first consider a non distributed matrix
1483 if (!matrix_distributed)
1484 {
1485 // set the Index_in_dof_block
1486 unsigned nrow = this->distribution_pt()->nrow();
1487 Index_in_dof_block_dense.resize(nrow);
1488 Index_in_dof_block_dense.initialise(0);
1489 for (unsigned i = 0; i < nrow; i++)
1490 {
1491 Index_in_dof_block_dense[i] = Dof_dimension[Dof_number_dense[i]];
1492 Dof_dimension[Dof_number_dense[i]]++;
1493 }
1494 }
1495
1496 // next a distributed matrix
1497 else
1498 {
1499#ifdef OOMPH_HAS_MPI
1500
1501
1502 // first compute how many instances of each dof are on this
1503 // processor
1504 unsigned* my_nrows_in_dof_block = new unsigned[Internal_ndof_types];
1505 for (unsigned i = 0; i < Internal_ndof_types; i++)
1506 {
1507 my_nrows_in_dof_block[i] = 0;
1508 }
1509 for (unsigned i = 0; i < nrow_local; i++)
1510 {
1511 my_nrows_in_dof_block[Dof_number_dense[i]]++;
1512 }
1513
1514 // next share the data
1515 unsigned* nrow_in_dof_block_recv =
1516 new unsigned[Internal_ndof_types * nproc];
1517 MPI_Allgather(my_nrows_in_dof_block,
1518 Internal_ndof_types,
1519 MPI_UNSIGNED,
1520 nrow_in_dof_block_recv,
1521 Internal_ndof_types,
1522 MPI_UNSIGNED,
1523 comm_pt()->mpi_comm());
1524 delete[] my_nrows_in_dof_block;
1525
1526 // compute my first dof index and Nrows_in_dof_block
1527 Vector<unsigned> my_first_dof_index(Internal_ndof_types, 0);
1528 for (unsigned i = 0; i < Internal_ndof_types; i++)
1529 {
1530 for (unsigned p = 0; p < my_rank; p++)
1531 {
1532 my_first_dof_index[i] +=
1533 nrow_in_dof_block_recv[p * Internal_ndof_types + i];
1534 }
1535 Dof_dimension[i] = my_first_dof_index[i];
1536 for (unsigned p = my_rank; p < nproc; p++)
1537 {
1538 Dof_dimension[i] +=
1539 nrow_in_dof_block_recv[p * Internal_ndof_types + i];
1540 }
1541 }
1542 delete[] nrow_in_dof_block_recv;
1543
1544 // next compute Index in dof block
1545 Index_in_dof_block_dense.resize(nrow_local);
1546 Index_in_dof_block_dense.initialise(0);
1547 Vector<unsigned> dof_counter(Internal_ndof_types, 0);
1548 for (unsigned i = 0; i < nrow_local; i++)
1549 {
1550 Index_in_dof_block_dense[i] =
1551 my_first_dof_index[Dof_number_dense[i]] +
1552 dof_counter[Dof_number_dense[i]];
1553 dof_counter[Dof_number_dense[i]]++;
1554 }
1555
1556 // the base displacements for the sends
1557 if (sparse_rows_for_proc_requests.size() > 0)
1558 {
1559 MPI_Waitall(sparse_rows_for_proc_requests.size(),
1560 &sparse_rows_for_proc_requests[0],
1561 MPI_STATUS_IGNORE);
1562 }
1563 MPI_Aint base_displacement;
1564 MPI_Get_address(dof_number_sparse_send, &base_displacement);
1565 unsigned first_row = this->distribution_pt()->first_row();
1566 for (unsigned p = 0; p < nproc; ++p)
1567 {
1568 if (nreq_sparse_for_proc[p] > 0)
1569 {
1570 // construct the data
1571 index_in_dof_block_sparse_send[p] =
1572 new unsigned[nreq_sparse_for_proc[p]];
1573 dof_number_sparse_send[p] = new unsigned[nreq_sparse_for_proc[p]];
1574 for (unsigned i = 0; i < nreq_sparse_for_proc[p]; ++i)
1575 {
1576 unsigned r = sparse_rows_for_proc[p][i];
1577 r -= first_row;
1578 index_in_dof_block_sparse_send[p][i] =
1579 Index_in_dof_block_dense[r];
1580 dof_number_sparse_send[p][i] = Dof_number_dense[r];
1581 }
1582 delete[] sparse_rows_for_proc[p];
1583
1584 // send the data
1585 // the datatypes, displacements, lengths for the two datatypes
1586 MPI_Datatype types[2];
1587 MPI_Aint displacements[2];
1588 int lengths[2];
1589
1590 // index in dof block
1591 MPI_Type_contiguous(
1592 nreq_sparse_for_proc[p], MPI_UNSIGNED, &types[0]);
1593 MPI_Type_commit(&types[0]);
1594 MPI_Get_address(index_in_dof_block_sparse_send[p],
1595 &displacements[0]);
1596 displacements[0] -= base_displacement;
1597 lengths[0] = 1;
1598
1599 // dof number
1600 MPI_Type_contiguous(
1601 nreq_sparse_for_proc[p], MPI_UNSIGNED, &types[1]);
1602 MPI_Type_commit(&types[1]);
1603 MPI_Get_address(dof_number_sparse_send[p], &displacements[1]);
1604 displacements[1] -= base_displacement;
1605 lengths[1] = 1;
1607 // build the final type
1608 MPI_Datatype send_type;
1609 MPI_Type_create_struct(
1610 2, lengths, displacements, types, &send_type);
1611 MPI_Type_commit(&send_type);
1612 MPI_Type_free(&types[0]);
1613 MPI_Type_free(&types[1]);
1614
1615 // and recv
1616 MPI_Request req;
1617 MPI_Isend(dof_number_sparse_send,
1618 1,
1619 send_type,
1620 p,
1621 33,
1622 comm_pt()->mpi_comm(),
1623 &req);
1624 send_requests_sparse.push_back(req);
1625 MPI_Type_free(&send_type);
1626 }
1627 else
1628 {
1629 index_in_dof_block_sparse_send[p] = 0;
1630 dof_number_sparse_send[p] = 0;
1631 }
1632 }
1633#endif
1634 }
1635 }
1636
1637 /// //////////////////////////////////////////////////////////////////////////
1638 // end of master block preconditioner only operations
1639 /// //////////////////////////////////////////////////////////////////////////
1640
1641 // compute the number of rows in each block
1642
1643#ifdef PARANOID
1644 // check the vector is the correct length
1645 if (dof_to_block_map.size() != Internal_ndof_types)
1646 {
1647 std::ostringstream error_message;
1648 error_message << "The dof_to_block_map vector (size="
1649 << dof_to_block_map.size()
1650 << ") must be of size Internal_ndof_types="
1651 << Internal_ndof_types;
1652 throw OomphLibError(
1653 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
1654 }
1655#endif
1656
1657 // find the maximum block number RAYAY use std::max_element
1658 unsigned max_block_number = 0;
1659 for (unsigned i = 0; i < Internal_ndof_types; i++)
1660 {
1661 if (dof_to_block_map[i] > max_block_number)
1662 {
1663 max_block_number = dof_to_block_map[i];
1664 }
1665 }
1667 // resize the storage the the block to dof map
1668 Block_number_to_dof_number_lookup.clear();
1669 Block_number_to_dof_number_lookup.resize(max_block_number + 1);
1670 Ndof_in_block.clear();
1671 Ndof_in_block.resize(max_block_number + 1);
1672
1673 // resize storage
1674 Dof_number_to_block_number_lookup.resize(Internal_ndof_types);
1675
1676 // build the storage for the two maps (block to dof) and (dof to block)
1677 for (unsigned i = 0; i < Internal_ndof_types; i++)
1678 {
1679 Dof_number_to_block_number_lookup[i] = dof_to_block_map[i];
1680 Block_number_to_dof_number_lookup[dof_to_block_map[i]].push_back(i);
1681 Ndof_in_block[dof_to_block_map[i]]++;
1682 }
1683
1684#ifdef PARANOID
1685 // paranoid check that every block number has at least one DOF associated
1686 // with it
1687 for (unsigned i = 0; i < max_block_number + 1; i++)
1688 {
1689 if (Block_number_to_dof_number_lookup[i].size() == 0)
1690 {
1691 std::ostringstream error_message;
1692 error_message << "block number " << i
1693 << " does not have any DOFs associated with it";
1694 throw OomphLibWarning(error_message.str(),
1695 OOMPH_CURRENT_FUNCTION,
1696 OOMPH_EXCEPTION_LOCATION);
1697 }
1698 }
1699#endif
1700
1701 // Update the number of blocks types.
1702 Internal_nblock_types = max_block_number + 1;
1703
1704 // Distributed or not, depends on if we have more than one processor.
1705 bool distributed = this->master_distribution_pt()->distributed();
1706
1707 // Create the new block distributions.
1708 Internal_block_distribution_pt.resize(Internal_nblock_types);
1709 for (unsigned i = 0; i < Internal_nblock_types; i++)
1710 {
1711 unsigned block_dim = 0;
1712 for (unsigned j = 0; j < Ndof_in_block[i]; j++)
1713 {
1714 block_dim +=
1715 internal_dof_block_dimension(Block_number_to_dof_number_lookup[i][j]);
1716 }
1717 Internal_block_distribution_pt[i] =
1718 new LinearAlgebraDistribution(comm_pt(), block_dim, distributed);
1719 }
1720
1721 // Work out the distribution of the dof-level blocks.
1722 // Since several dof types may be coarsened into a single dof type.
1723 // We get the dof-level block distributions from the parent preconditioner.
1724
1725 // How many dof types are there?
1726 if (is_subsidiary_block_preconditioner())
1727 {
1728 // Delete any pre-existing distributions.
1729 const unsigned dof_block_distribution_size =
1730 Dof_block_distribution_pt.size();
1731 for (unsigned dof_i = 0; dof_i < dof_block_distribution_size; dof_i++)
1732 {
1733 delete Dof_block_distribution_pt[dof_i];
1734 }
1735 const unsigned ndofs = this->ndof_types();
1736 Dof_block_distribution_pt.resize(ndofs, 0);
1737
1738 // For each dof type, work out how many parent preconditioner dof types
1739 // are in it.
1740 for (unsigned dof_i = 0; dof_i < ndofs; dof_i++)
1741 {
1742 // For each external dof, we get the dofs coarsened into it (from the
1743 // parent preconditioner level, not the most fine grain level).
1744 const unsigned ncoarsened_dofs_in_dof_i =
1745 Doftype_coarsen_map_coarse[dof_i].size();
1746 Vector<LinearAlgebraDistribution*> tmp_dist_pt(ncoarsened_dofs_in_dof_i,
1747 0);
1748 for (unsigned parent_dof_i = 0; parent_dof_i < ncoarsened_dofs_in_dof_i;
1749 parent_dof_i++)
1750 {
1751 tmp_dist_pt[parent_dof_i] =
1752 master_block_preconditioner_pt()->dof_block_distribution_pt(
1753 Doftype_in_master_preconditioner_coarse
1754 [Doftype_coarsen_map_coarse[dof_i][parent_dof_i]]);
1755 }
1756
1757 Dof_block_distribution_pt[dof_i] = new LinearAlgebraDistribution;
1758
1759
1761 tmp_dist_pt, *Dof_block_distribution_pt[dof_i]);
1762 }
1763 }
1764
1765 // Create Block_distribution_pt
1766 {
1767 // Delete any existing distributions in Block_distribution_pt.
1768 // (This should already be deleted in clear_block_preconditioner_base(...)
1769 // but we are just being extra safe!).
1770 unsigned n_existing_block_dist = Block_distribution_pt.size();
1771 for (unsigned dist_i = 0; dist_i < n_existing_block_dist; dist_i++)
1772 {
1773 delete Block_distribution_pt[dist_i];
1774 }
1775
1776 Block_distribution_pt.clear();
1777
1778 // Work out the distributions of the concatenated blocks.
1779 unsigned super_block_size = Block_to_dof_map_coarse.size();
1780 Block_distribution_pt.resize(super_block_size, 0);
1781 for (unsigned super_block_i = 0; super_block_i < super_block_size;
1782 super_block_i++)
1783 {
1784 unsigned sub_block_size = Block_to_dof_map_coarse[super_block_i].size();
1785 Vector<LinearAlgebraDistribution*> tmp_dist_pt(sub_block_size, 0);
1786
1787 for (unsigned sub_block_i = 0; sub_block_i < sub_block_size;
1788 sub_block_i++)
1789 {
1790 tmp_dist_pt[sub_block_i] = dof_block_distribution_pt(
1791 Block_to_dof_map_coarse[super_block_i][sub_block_i]);
1792 }
1793
1794 Block_distribution_pt[super_block_i] = new LinearAlgebraDistribution;
1795
1797 tmp_dist_pt, *Block_distribution_pt[super_block_i]);
1798 }
1799
1800 } // Creating Block_distribution_pt.
1801
1802
1803 // Create the distribution of the preconditioner matrix,
1804 // if this preconditioner is a subsidiary preconditioner then it stored
1805 // at Distribution_pt;
1806 // if this preconditioner is a master preconditioner then it is stored
1807 // at Internal_preconditioner_matrix_distribution_pt.
1808 LinearAlgebraDistribution dist;
1810 Internal_block_distribution_pt, dist);
1811
1812 // Build the distribution.
1813 if (is_subsidiary_block_preconditioner())
1814 {
1815 this->build_distribution(dist);
1816 }
1817 else
1818 {
1819 Internal_preconditioner_matrix_distribution_pt =
1820 new LinearAlgebraDistribution(dist);
1821 }
1822
1823 Preconditioner_matrix_distribution_pt = new LinearAlgebraDistribution;
1825 Block_distribution_pt, *Preconditioner_matrix_distribution_pt);
1826
1827 // Clear all distributions in Auxiliary_block_distribution_pt, except for
1828 // the one which corresponds to the preconditioner matrix distribution. This
1829 // is already deleted by clear_block_preconditioner_base(...)
1830
1831 // Create the key which corresponds to
1832 // preconditioner_matrix_distribution_pt.
1833 {
1834 const unsigned nblocks = Block_distribution_pt.size();
1835 Vector<unsigned> preconditioner_matrix_key(nblocks, 0);
1836 for (unsigned i = 0; i < nblocks; i++)
1837 {
1838 preconditioner_matrix_key[i] = i;
1839 }
1840
1841 // Now iterate through Auxiliary_block_distribution_pt and delete
1842 // everything except for the value which corresponds to
1843 // preconditioner_matrix_key.
1844 std::map<Vector<unsigned>, LinearAlgebraDistribution*>::iterator iter =
1845 Auxiliary_block_distribution_pt.begin();
1846 while (iter != Auxiliary_block_distribution_pt.end())
1847 {
1848 if (iter->first != preconditioner_matrix_key)
1849 {
1850 delete iter->second;
1851 iter++;
1852 }
1853 else
1854 {
1855 ++iter;
1856 }
1857 }
1858
1859 // Clear it just to be safe!
1860 Auxiliary_block_distribution_pt.clear();
1861
1862 // Insert the preconditioner matrix distribution.
1863 insert_auxiliary_block_distribution(
1864 preconditioner_matrix_key, Preconditioner_matrix_distribution_pt);
1865 } // End of Auxiliary_block_distribution_pt encapsulation.
1866
1867 // Clearing up after comm to assemble sparse lookup schemes.
1868#ifdef OOMPH_HAS_MPI
1869 if (send_requests_sparse.size() > 0)
1870 {
1871 MPI_Waitall(send_requests_sparse.size(),
1872 &send_requests_sparse[0],
1873 MPI_STATUS_IGNORE);
1874 }
1875 if (recv_requests_sparse.size() > 0)
1876 {
1877 MPI_Waitall(recv_requests_sparse.size(),
1878 &recv_requests_sparse[0],
1879 MPI_STATUS_IGNORE);
1880 }
1881 for (unsigned p = 0; p < nproc; p++)
1882 {
1883 delete[] index_in_dof_block_sparse_send[p];
1884 delete[] dof_number_sparse_send[p];
1885 }
1886 delete[] index_in_dof_block_sparse_send;
1887 delete[] dof_number_sparse_send;
1888 delete[] nreq_sparse;
1889 delete[] nreq_sparse_for_proc;
1890#endif
1891
1892 // Next we assemble the lookup schemes for the rows
1893 // if the matrix is not distributed then we assemble Global_index
1894 // if the matrix is distributed then Rows_to_send_..., Rows_to_recv_... etc.
1895 if (!distributed)
1896 {
1897 // Resize the storage.
1898 Global_index.resize(Internal_nblock_types);
1899 for (unsigned b = 0; b < Internal_nblock_types; b++)
1900 {
1901 Global_index[b].resize(Internal_block_distribution_pt[b]->nrow());
1902 }
1903
1904 // Compute:
1905 unsigned nrow = this->master_nrow();
1906 for (unsigned i = 0; i < nrow; i++)
1907 {
1908 // the dof type number;
1909 int dof_number = this->internal_dof_number(i);
1910 if (dof_number >= 0)
1911 {
1912 // the block number;
1913 unsigned block_number = Dof_number_to_block_number_lookup[dof_number];
1914
1915 // the index in the block.
1916 unsigned index_in_block = 0;
1917 unsigned ptr = 0;
1918 while (int(Block_number_to_dof_number_lookup[block_number][ptr]) !=
1919 dof_number)
1920 {
1921 index_in_block += internal_dof_block_dimension(
1922 Block_number_to_dof_number_lookup[block_number][ptr]);
1923 ptr++;
1924 }
1925 index_in_block += internal_index_in_dof(i);
1926 Global_index[block_number][index_in_block] = i;
1927 }
1928 }
1929 }
1930 // otherwise the matrix is distributed
1931 else
1932 {
1933#ifdef OOMPH_HAS_MPI
1934
1935 // the pointer to the master distribution
1936 const LinearAlgebraDistribution* master_distribution_pt =
1937 this->master_distribution_pt();
1938
1939 // resize the nrows... storage
1940 Nrows_to_send_for_get_block.resize(Internal_nblock_types, nproc);
1941 Nrows_to_send_for_get_block.initialise(0);
1942 Nrows_to_send_for_get_ordered.resize(nproc);
1943 Nrows_to_send_for_get_ordered.initialise(0);
1944
1945 // loop over my rows
1946 unsigned nrow_local = master_distribution_pt->nrow_local();
1947 unsigned first_row = master_distribution_pt->first_row();
1948 for (unsigned i = 0; i < nrow_local; i++)
1949 {
1950 // the block number
1951 int b = this->internal_block_number(first_row + i);
1952
1953 // check that the DOF i is associated with this preconditioner
1954 if (b >= 0)
1955 {
1956 // the block index
1957 unsigned j = this->internal_index_in_block(first_row + i);
1958
1959 // the processor this row will be sent to
1960 unsigned block_p = 0;
1961 while (!(Internal_block_distribution_pt[b]->first_row(block_p) <= j &&
1962 (Internal_block_distribution_pt[b]->first_row(block_p) +
1963 Internal_block_distribution_pt[b]->nrow_local(block_p) >
1964 j)))
1965 {
1966 block_p++;
1967 }
1968
1969 // and increment the counter
1970 Nrows_to_send_for_get_block(b, block_p)++;
1971 Nrows_to_send_for_get_ordered[block_p]++;
1972 }
1973 }
1974
1975 // resize the storage for Nrows_to_recv
1976 Nrows_to_recv_for_get_block.resize(Internal_nblock_types, nproc);
1977 Nrows_to_recv_for_get_block.initialise(0);
1978 Nrows_to_recv_for_get_ordered.resize(nproc);
1979 Nrows_to_recv_for_get_ordered.initialise(0);
1980
1981 // next we send the number of rows that will be sent by this processor
1982 Vector<unsigned*> nrows_to_send(nproc, 0);
1983 Vector<unsigned*> nrows_to_recv(nproc, 0);
1984 Vector<MPI_Request> send_requests_nrow;
1985 Vector<MPI_Request> recv_requests_nrow;
1986 Vector<unsigned> proc;
1987 for (unsigned p = 0; p < nproc; p++)
1988 {
1989 if (p != my_rank)
1990 {
1991 // send
1992 proc.push_back(p);
1993 nrows_to_send[p] = new unsigned[Internal_nblock_types];
1994 for (unsigned b = 0; b < Internal_nblock_types; b++)
1995 {
1996 nrows_to_send[p][b] = Nrows_to_send_for_get_block(b, p);
1997 }
1998 MPI_Request s_req;
1999 MPI_Isend(nrows_to_send[p],
2000 Internal_nblock_types,
2001 MPI_UNSIGNED,
2002 p,
2003 3,
2004 comm_pt()->mpi_comm(),
2005 &s_req);
2006 send_requests_nrow.push_back(s_req);
2007
2008 // recv
2009 nrows_to_recv[p] = new unsigned[Internal_nblock_types];
2010 MPI_Request r_req;
2011 MPI_Irecv(nrows_to_recv[p],
2012 Internal_nblock_types,
2013 MPI_UNSIGNED,
2014 p,
2015 3,
2016 comm_pt()->mpi_comm(),
2017 &r_req);
2018 recv_requests_nrow.push_back(r_req);
2019 }
2020 // send to self
2021 else
2022 {
2023 for (unsigned b = 0; b < Internal_nblock_types; b++)
2024 {
2025 Nrows_to_recv_for_get_block(b, p) =
2026 Nrows_to_send_for_get_block(b, p);
2027 }
2028 Nrows_to_recv_for_get_ordered[p] = Nrows_to_send_for_get_ordered[p];
2029 }
2030 }
2031
2032 // create some temporary storage for the global row indices that will
2033 // be received from another processor.
2034 DenseMatrix<int*> block_rows_to_send(Internal_nblock_types, nproc, 0);
2035 Vector<int*> ordered_rows_to_send(nproc, 0);
2036
2037 // resize the rows... storage
2038 Rows_to_send_for_get_block.resize(Internal_nblock_types, nproc);
2039 Rows_to_send_for_get_block.initialise(0);
2040 Rows_to_send_for_get_ordered.resize(nproc);
2041 Rows_to_send_for_get_ordered.initialise(0);
2042 Rows_to_recv_for_get_block.resize(Internal_nblock_types, nproc);
2043 Rows_to_recv_for_get_block.initialise(0);
2044
2045 // resize the storage
2046 for (unsigned p = 0; p < nproc; p++)
2047 {
2048 for (unsigned b = 0; b < Internal_nblock_types; b++)
2049 {
2050 Rows_to_send_for_get_block(b, p) =
2051 new int[Nrows_to_send_for_get_block(b, p)];
2052 if (p != my_rank)
2053 {
2054 block_rows_to_send(b, p) =
2055 new int[Nrows_to_send_for_get_block(b, p)];
2056 }
2057 else
2058 {
2059 Rows_to_recv_for_get_block(b, p) =
2060 new int[Nrows_to_send_for_get_block(b, p)];
2061 }
2062 }
2063 Rows_to_send_for_get_ordered[p] =
2064 new int[Nrows_to_send_for_get_ordered[p]];
2065 }
2066
2067
2068 // loop over my rows to allocate the nrows
2069 DenseMatrix<unsigned> ptr_block(Internal_nblock_types, nproc, 0);
2070 for (unsigned i = 0; i < nrow_local; i++)
2071 {
2072 // the block number
2073 int b = this->internal_block_number(first_row + i);
2074
2075 // check that the DOF i is associated with this preconditioner
2076 if (b >= 0)
2077 {
2078 // the block index
2079 unsigned j = this->internal_index_in_block(first_row + i);
2080
2081 // the processor this row will be sent to
2082 unsigned block_p = 0;
2083 while (!(Internal_block_distribution_pt[b]->first_row(block_p) <= j &&
2084 (Internal_block_distribution_pt[b]->first_row(block_p) +
2085 Internal_block_distribution_pt[b]->nrow_local(block_p) >
2086 j)))
2087 {
2088 block_p++;
2089 }
2090
2091 // and store the row
2092 Rows_to_send_for_get_block(b, block_p)[ptr_block(b, block_p)] = i;
2093 if (block_p != my_rank)
2094 {
2095 block_rows_to_send(b, block_p)[ptr_block(b, block_p)] =
2096 j - Internal_block_distribution_pt[b]->first_row(block_p);
2097 }
2098 else
2099 {
2100 Rows_to_recv_for_get_block(b, block_p)[ptr_block(b, block_p)] =
2101 j - Internal_block_distribution_pt[b]->first_row(block_p);
2102 }
2103 ptr_block(b, block_p)++;
2104 }
2105 }
2106
2107 // next block ordered
2108 for (unsigned p = 0; p < nproc; ++p)
2109 {
2110 int pt = 0;
2111 for (unsigned b = 0; b < Internal_nblock_types; ++b)
2112 {
2113 for (unsigned i = 0; i < Nrows_to_send_for_get_block(b, p); ++i)
2114 {
2115 Rows_to_send_for_get_ordered[p][pt] =
2116 Rows_to_send_for_get_block(b, p)[i];
2117 pt++;
2118 }
2119 }
2120 }
2121
2122 // next process the nrow recvs as they complete
2123
2124 // recv and store the data
2125 unsigned c = recv_requests_nrow.size();
2126 while (c > 0)
2127 {
2128 // wait for any communication to finish
2129 int req_number;
2130 MPI_Waitany(c, &recv_requests_nrow[0], &req_number, MPI_STATUS_IGNORE);
2131 recv_requests_nrow.erase(recv_requests_nrow.begin() + req_number);
2132 c--;
2133
2134 // determine the source processor
2135 unsigned p = proc[req_number];
2136 proc.erase(proc.begin() + req_number);
2137
2138 // copy the data to its final storage
2139 Nrows_to_recv_for_get_ordered[p] = 0;
2140 for (unsigned b = 0; b < Internal_nblock_types; b++)
2141 {
2142 Nrows_to_recv_for_get_block(b, p) = nrows_to_recv[p][b];
2143 Nrows_to_recv_for_get_ordered[p] += nrows_to_recv[p][b];
2144 }
2145
2146 // and clear
2147 delete[] nrows_to_recv[p];
2148 }
2149
2150 // resize the storage for the incoming rows data
2151 Rows_to_recv_for_get_ordered.resize(nproc, 0);
2152 for (unsigned p = 0; p < nproc; p++)
2153 {
2154 if (p != my_rank)
2155 {
2156 for (unsigned b = 0; b < Internal_nblock_types; b++)
2157 {
2158 Rows_to_recv_for_get_block(b, p) =
2159 new int[Nrows_to_recv_for_get_block(b, p)];
2160 }
2161 }
2162 }
2163
2164 // compute the number of sends and recv from this processor
2165 // to each other processor
2166 Vector<unsigned> nsend_for_rows(nproc, 0);
2167 Vector<unsigned> nrecv_for_rows(nproc, 0);
2168 for (unsigned p = 0; p < nproc; p++)
2169 {
2170 if (p != my_rank)
2171 {
2172 for (unsigned b = 0; b < Internal_nblock_types; b++)
2173 {
2174 if (Nrows_to_send_for_get_block(b, p) > 0)
2175 {
2176 nsend_for_rows[p]++;
2177 }
2178 if (Nrows_to_recv_for_get_block(b, p) > 0)
2179 {
2180 nrecv_for_rows[p]++;
2181 }
2182 }
2183 }
2184 }
2185
2186 // finally post the sends and recvs
2187 MPI_Aint base_displacement;
2188 MPI_Get_address(matrix_pt(), &base_displacement);
2189 Vector<MPI_Request> req_rows;
2190 for (unsigned p = 0; p < nproc; p++)
2191 {
2192 if (p != my_rank)
2193 {
2194 // send
2195 if (nsend_for_rows[p] > 0)
2196 {
2197 MPI_Datatype send_types[nsend_for_rows[p]];
2198 MPI_Aint send_displacements[nsend_for_rows[p]];
2199 int send_sz[nsend_for_rows[p]];
2200 unsigned send_ptr = 0;
2201 for (unsigned b = 0; b < Internal_nblock_types; b++)
2202 {
2203 if (Nrows_to_send_for_get_block(b, p) > 0)
2204 {
2205 MPI_Type_contiguous(Nrows_to_send_for_get_block(b, p),
2206 MPI_INT,
2207 &send_types[send_ptr]);
2208 MPI_Type_commit(&send_types[send_ptr]);
2209 MPI_Get_address(block_rows_to_send(b, p),
2210 &send_displacements[send_ptr]);
2211 send_displacements[send_ptr] -= base_displacement;
2212 send_sz[send_ptr] = 1;
2213 send_ptr++;
2214 }
2215 }
2216 MPI_Datatype final_send_type;
2217 MPI_Type_create_struct(nsend_for_rows[p],
2218 send_sz,
2219 send_displacements,
2220 send_types,
2221 &final_send_type);
2222 MPI_Type_commit(&final_send_type);
2223 for (unsigned i = 0; i < nsend_for_rows[p]; i++)
2224 {
2225 MPI_Type_free(&send_types[i]);
2226 }
2227 MPI_Request send_req;
2228 MPI_Isend(matrix_pt(),
2229 1,
2230 final_send_type,
2231 p,
2232 4,
2233 comm_pt()->mpi_comm(),
2234 &send_req);
2235 req_rows.push_back(send_req);
2236 MPI_Type_free(&final_send_type);
2237 }
2238
2239 // recv
2240 if (nrecv_for_rows[p] > 0)
2241 {
2242 MPI_Datatype recv_types[nrecv_for_rows[p]];
2243 MPI_Aint recv_displacements[nrecv_for_rows[p]];
2244 int recv_sz[nrecv_for_rows[p]];
2245 unsigned recv_ptr = 0;
2246 for (unsigned b = 0; b < Internal_nblock_types; b++)
2247 {
2248 if (Nrows_to_recv_for_get_block(b, p) > 0)
2249 {
2250 MPI_Type_contiguous(Nrows_to_recv_for_get_block(b, p),
2251 MPI_INT,
2252 &recv_types[recv_ptr]);
2253 MPI_Type_commit(&recv_types[recv_ptr]);
2254 MPI_Get_address(Rows_to_recv_for_get_block(b, p),
2255 &recv_displacements[recv_ptr]);
2256 recv_displacements[recv_ptr] -= base_displacement;
2257 recv_sz[recv_ptr] = 1;
2258 recv_ptr++;
2259 }
2260 }
2261 MPI_Datatype final_recv_type;
2262 MPI_Type_create_struct(nrecv_for_rows[p],
2263 recv_sz,
2264 recv_displacements,
2265 recv_types,
2266 &final_recv_type);
2267 MPI_Type_commit(&final_recv_type);
2268 for (unsigned i = 0; i < nrecv_for_rows[p]; i++)
2269 {
2270 MPI_Type_free(&recv_types[i]);
2271 }
2272 MPI_Request recv_req;
2273 MPI_Irecv(matrix_pt(),
2274 1,
2275 final_recv_type,
2276 p,
2277 4,
2278 comm_pt()->mpi_comm(),
2279 &recv_req);
2280 req_rows.push_back(recv_req);
2281 MPI_Type_free(&final_recv_type);
2282 }
2283 }
2284 }
2285
2286 // cleaning up Waitalls
2287
2288
2289 // wait for the recv requests so we can compute
2290 // Nrows_to_recv_for_get_ordered
2291 unsigned n_req_rows = req_rows.size();
2292 if (n_req_rows)
2293 {
2294 MPI_Waitall(n_req_rows, &req_rows[0], MPI_STATUS_IGNORE);
2295 }
2296
2297 // resize the storage
2298 Rows_to_recv_for_get_ordered.resize(nproc);
2299 Rows_to_recv_for_get_ordered.initialise(0);
2300
2301 // construct block offset
2302 Vector<int> vec_offset(Internal_nblock_types, 0);
2303 for (unsigned b = 1; b < Internal_nblock_types; ++b)
2304 {
2305 vec_offset[b] = vec_offset[b - 1] +
2306 Internal_block_distribution_pt[b - 1]->nrow_local();
2307 }
2308
2309 //
2310 for (unsigned p = 0; p < nproc; p++)
2311 {
2312 int pt = 0;
2313 Rows_to_recv_for_get_ordered[p] =
2314 new int[Nrows_to_recv_for_get_ordered[p]];
2315 for (unsigned b = 0; b < Internal_nblock_types; b++)
2316 {
2317 for (unsigned i = 0; i < Nrows_to_recv_for_get_block(b, p); i++)
2318 {
2319 Rows_to_recv_for_get_ordered[p][pt] =
2320 Rows_to_recv_for_get_block(b, p)[i] + vec_offset[b];
2321 pt++;
2322 }
2323 }
2324 }
2325
2326 // clean up
2327 for (unsigned p = 0; p < nproc; p++)
2328 {
2329 if (p != my_rank)
2330 {
2331 for (unsigned b = 0; b < Internal_nblock_types; b++)
2332 {
2333 delete[] block_rows_to_send(b, p);
2334 }
2335 if (Nrows_to_send_for_get_ordered[p] > 0)
2336 {
2337 delete[] ordered_rows_to_send[p];
2338 }
2339 }
2340 }
2341
2342 // and the send reqs
2343 unsigned n_req_send_nrow = send_requests_nrow.size();
2344 if (n_req_send_nrow)
2345 {
2346 MPI_Waitall(n_req_send_nrow, &send_requests_nrow[0], MPI_STATUS_IGNORE);
2347 }
2348 for (unsigned p = 0; p < nproc; p++)
2349 {
2350 delete[] nrows_to_send[p];
2351 }
2352#endif
2353 }
2354
2355 // If we asked for output of blocks to a file then do it.
2356 if (block_output_on()) output_blocks_to_files(Output_base_filename);
2357 }
2358
2359 //============================================================================
2360 //??ds
2361 /// Function to turn this preconditioner into a
2362 /// subsidiary preconditioner that operates within a bigger
2363 /// master block preconditioner (e.g. a Navier-Stokes 2x2 block
2364 /// preconditioner dealing with the fluid sub-blocks within a
2365 /// 3x3 FSI preconditioner). Once this is done the master block
2366 /// preconditioner deals with the block setup etc.
2367 /// The vector block_map must specify the dof number in the
2368 /// master preconditioner that corresponds to a block number in this
2369 /// preconditioner. ??ds horribly misleading comment!
2370 /// The length of the vector is used to determine the number of
2371 /// blocks in this preconditioner therefore it must be correctly sized.
2372 /// This calls the other turn_into_subsidiary_block_preconditioner(...)
2373 /// function providing an empty doftype_to_doftype_map vector.
2374 //============================================================================
2375 template<typename MATRIX>
2377 BlockPreconditioner<MATRIX>* master_block_prec_pt,
2378 const Vector<unsigned>& doftype_in_master_preconditioner_coarse)
2379 {
2380 // Create the identity dof_coarsen_map
2381 Vector<Vector<unsigned>> doftype_coarsen_map_coarse;
2382 unsigned doftype_in_master_preconditioner_coarse_size =
2383 doftype_in_master_preconditioner_coarse.size();
2384
2385 for (unsigned dof_i = 0;
2386 dof_i < doftype_in_master_preconditioner_coarse_size;
2387 dof_i++)
2388 {
2389 // Create a vector of size 1 and value i,
2390 // then push it into the dof_coarsen_map vector.
2391 Vector<unsigned> tmp_vec(1, dof_i);
2392 doftype_coarsen_map_coarse.push_back(tmp_vec);
2393 }
2394
2395 // Call the other turn_into_subsidiary_block_preconditioner function.
2396 turn_into_subsidiary_block_preconditioner(
2397 master_block_prec_pt,
2398 doftype_in_master_preconditioner_coarse,
2399 doftype_coarsen_map_coarse);
2400 }
2401
2402
2403 //============================================================================
2404 /// Function to turn this block preconditioner into a
2405 /// subsidiary block preconditioner that operates within a bigger
2406 /// master block preconditioner (e.g. a Navier-Stokes 2x2 block
2407 /// preconditioner dealing with the fluid sub-blocks within a
2408 /// 3x3 FSI preconditioner). Once this is done the master block
2409 /// preconditioner deals with the block setup etc.
2410 ///
2411 /// The vector doftype_map must specify the dof type in the
2412 /// master preconditioner that corresponds to a dof type in this block
2413 /// preconditioner.
2414 ///
2415 /// In general, we want:
2416 /// doftype_map[doftype in subsidiary prec] = doftype in master prec.
2417 ///
2418 /// It tells this block preconditioner which dof types of the master
2419 /// block preconditioner it is working with.
2420 ///
2421 /// The length of the vector is used to determine the number of
2422 /// dof types in THIS block preconditioner therefore it must be correctly
2423 /// sized.
2424 ///
2425 /// For example, let the master block preconditioner have 5 dof types in total
2426 /// and a 1-4 dof type splitting where the block (0,0) corresponds to
2427 /// dof type 0 and the block (1,1) corresponds to dof types 1, 2, 3 and 4
2428 /// (i.e. it would have given to block_setup the vector [0,1,1,1,1]).
2429 /// Furthermore, it solves (1,1) block with subsidiary block preconditioner.
2430 /// Then the doftype_map passed to this function of the subsidiary block
2431 /// preconditioner would be [1, 2, 3, 4].
2432 ///
2433 /// Dof type coarsening (following on from the example above):
2434 /// Let the subsidiary block preconditioner (THIS block preconditioner)
2435 /// work with only two DOF types; then the master block preconditioner must
2436 /// "coarsen" the dof types by providing the optional argument
2437 /// doftype_coarsen_map vector.
2438 ///
2439 /// The doftype_coarsen_map vector (in this case) might be [[0,1], [2,3]]
2440 /// telling the subsidiary block preconditioner that the SUBSIDIARY dof types
2441 /// 0 and 1 should be treated as dof type 0 and the subsidiary dof types 2
2442 /// and 3 should be treated as subsidiary dof type 1.
2443 ///
2444 /// If no doftype_coarsen_map vector is provided, then the identity is
2445 /// used automatically (see the turn_into_subsidiary_block_preconditioner(...)
2446 /// function with only two arguments). In the above case, the identity
2447 /// doftype_coarsen_map vector for the subsidiary block preconditioner
2448 /// would be the 2D vector [[0], [1], [2], [3]] which means
2449 /// dof type 0 is treated as dof type 0,
2450 /// dof type 1 is treated as dof type 1,
2451 /// dof type 2 is treated as dof type 2, and
2452 /// dof type 3 is treated as dof type 3.
2453 //============================================================================
2454 template<typename MATRIX>
2456 BlockPreconditioner<MATRIX>* master_block_prec_pt,
2457 const Vector<unsigned>& doftype_in_master_preconditioner_coarse,
2458 const Vector<Vector<unsigned>>& doftype_coarsen_map_coarse)
2459 {
2460 // Set the master block preconditioner pointer
2461 Master_block_preconditioner_pt = master_block_prec_pt;
2462
2463 // Set the Doftype_coarsen_map_coarse.
2464 Doftype_coarsen_map_coarse = doftype_coarsen_map_coarse;
2465
2466 Doftype_in_master_preconditioner_coarse =
2467 doftype_in_master_preconditioner_coarse;
2468 } // end of turn_into_subsidiary_block_preconditioner(...)
2469
2470
2471 //============================================================================
2472 /// Determine the size of the matrix blocks and setup the
2473 /// lookup schemes relating the global degrees of freedom with
2474 /// their "blocks" and their indices (row/column numbers) in those
2475 /// blocks.
2476 /// The distributions of the preconditioner and the blocks are
2477 /// automatically specified (and assumed to be uniform) at this
2478 /// stage.
2479 /// This method should be used if each DOF type corresponds to a
2480 /// unique block type.
2481 //============================================================================
2482 template<typename MATRIX>
2484 {
2485#ifdef PARANOID
2486
2487 // Subsidiary preconditioners don't really need the meshes
2488 if (this->is_master_block_preconditioner())
2489 {
2490 std::ostringstream err_msg;
2491 unsigned n = nmesh();
2492 if (n == 0)
2493 {
2494 err_msg << "No meshes have been set for this block preconditioner!\n"
2495 << "Set one with set_nmesh(...), set_mesh(...)" << std::endl;
2496 throw OomphLibError(
2497 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2498 for (unsigned m = 0; m < n; m++)
2499 {
2500 if (Mesh_pt[m] == 0)
2501 {
2502 err_msg << "The mesh pointer to mesh " << m << " is null!\n"
2503 << "Set a non-null one with set_mesh(...)" << std::endl;
2504 throw OomphLibError(
2505 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2506 }
2507 }
2508 }
2509 }
2510#endif
2511
2512 // Get the number of dof types.
2513 unsigned internal_n_dof_types = ndof_types();
2514
2515 // Build the dof to block map - assume that each type of dof corresponds
2516 // to a different type of block.
2517 Vector<unsigned> dof_to_block_lookup(internal_n_dof_types);
2518 for (unsigned i = 0; i < internal_n_dof_types; i++)
2519 {
2520 dof_to_block_lookup[i] = i;
2521 }
2522
2523 // call the block setup method
2524 this->block_setup(dof_to_block_lookup);
2525 }
2526
2527
2528 //============================================================================
2529 /// Get the block matrices required for the block preconditioner. Takes a
2530 /// pointer to a matrix of bools that indicate if a specified sub-block is
2531 /// required for the preconditioning operation. Computes the required block
2532 /// matrices, and stores pointers to them in the matrix block_matrix_pt. If an
2533 /// entry in block_matrix_pt is equal to NULL that sub-block has not been
2534 /// requested and is therefore not available.
2535 //============================================================================
2536 template<typename MATRIX>
2538 DenseMatrix<bool>& required_blocks,
2539 DenseMatrix<MATRIX*>& block_matrix_pt) const
2540 {
2541 // Cache number of block types
2542 const unsigned n_block_types = nblock_types();
2543
2544#ifdef PARANOID
2545 // If required blocks matrix pointer is not the correct size then abort.
2546 if ((required_blocks.nrow() != n_block_types) ||
2547 (required_blocks.ncol() != n_block_types))
2548 {
2549 std::ostringstream error_message;
2550 error_message << "The size of the matrix of bools required_blocks "
2551 << "(which indicates which blocks are required) is not the "
2552 << "right size, required_blocks is "
2553 << required_blocks.ncol() << " x " << required_blocks.nrow()
2554 << ", whereas it should "
2555 << "be " << n_block_types << " x " << n_block_types;
2556 throw OomphLibError(
2557 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2558 }
2559
2560 // If block matrix pointer is not the correct size then abort.
2561 if ((block_matrix_pt.nrow() != n_block_types) ||
2562 (block_matrix_pt.ncol() != n_block_types))
2563 {
2564 std::ostringstream error_message;
2565 error_message << "The size of the block matrix pt is not the "
2566 << "right size, block_matrix_pt is "
2567 << block_matrix_pt.ncol() << " x " << block_matrix_pt.nrow()
2568 << ", whereas it should "
2569 << "be " << n_block_types << " x " << n_block_types;
2570 throw OomphLibError(
2571 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2572 }
2573
2574#endif
2575
2576 // Loop over the blocks
2577 for (unsigned i = 0; i < n_block_types; i++)
2578 {
2579 for (unsigned j = 0; j < n_block_types; j++)
2580 {
2581 // If block(i,j) is required then create a matrix and fill it in.
2582 if (required_blocks(i, j))
2583 {
2584 //??ds might want to remove this use of new as well?
2585 block_matrix_pt(i, j) = new MATRIX;
2586 get_block(i, j, *block_matrix_pt(i, j));
2587 }
2588
2589 // Otherwise set pointer to null.
2590 else
2591 {
2592 block_matrix_pt(i, j) = 0;
2593 }
2594 }
2595 }
2596 }
2597
2598 //============================================================================
2599 /// Takes the naturally ordered vector and extracts the blocks
2600 /// indicated by the block number (the values) in the Vector
2601 /// block_vec_number all at once, then concatenates them without
2602 /// communication. Here, the values in block_vec_number is the block number
2603 /// in the current preconditioner.
2604 /// This is a non-const function because distributions may be created
2605 /// and stored in Auxiliary_block_distribution_pt for future use.
2606 //============================================================================
2607 template<typename MATRIX>
2609 const Vector<unsigned>& block_vec_number,
2610 const DoubleVector& v,
2611 DoubleVector& w)
2612 {
2613#ifdef PARANOID
2614
2615 // Check if v is built.
2616 if (!v.built())
2617 {
2618 std::ostringstream err_msg;
2619 err_msg << "The distribution of the global vector v must be setup.";
2620 throw OomphLibError(
2621 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2622 }
2623
2624 // v must have the same distribution as the upper-most master block
2625 // preconditioner, since the upper-most master block preconditioner
2626 // should have the same distribution as the matrix pointed to
2627 // by matrix_pt().
2628 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
2629 {
2630 std::ostringstream err_msg;
2631 err_msg << "The distribution of the global vector v must match the "
2632 << " specified master_distribution_pt(). \n"
2633 << "i.e. Distribution_pt in the master preconditioner";
2634 throw OomphLibError(
2635 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2636 }
2637
2638 // Check to see if there are more blocks defined in the block_vec_number
2639 // vector than the number of block types. This is not allowed.
2640 const unsigned para_nblock_types = nblock_types();
2641 const unsigned para_block_vec_number_size = block_vec_number.size();
2642 if (para_block_vec_number_size > para_nblock_types)
2643 {
2644 std::ostringstream err_msg;
2645 err_msg << "You have requested " << para_block_vec_number_size
2646 << " number of blocks, (block_vec_number.size() is "
2647 << para_block_vec_number_size << ").\n"
2648 << "But there are only " << para_nblock_types
2649 << " nblock_types.\n"
2650 << "Please make sure that block_vec_number is correctly sized.\n";
2651 throw OomphLibError(
2652 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2654
2655 // Check if any block numbers defined in block_vec_number is equal to or
2656 // greater than the number of block types.
2657 // E.g. if there are 5 block types, we can only have block numbers:
2658 // 0, 1, 2, 3 and 4.
2659 for (unsigned i = 0; i < para_block_vec_number_size; i++)
2661 const unsigned para_required_block = block_vec_number[i];
2662 if (para_required_block >= para_nblock_types)
2663 {
2664 std::ostringstream err_msg;
2665 err_msg << "block_vec_number[" << i << "] is " << para_required_block
2666 << ".\n"
2667 << "But there are only " << para_nblock_types
2668 << " nblock_types.\n";
2669 throw OomphLibError(
2670 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2672 }
2673
2674 // Check that no block number is inserted twice.
2675 std::set<unsigned> para_set;
2676 for (unsigned b = 0; b < para_block_vec_number_size; b++)
2677 {
2678 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
2679 para_set_ret = para_set.insert(block_vec_number[b]);
2680
2681 if (!para_set_ret.second)
2682 {
2683 std::ostringstream err_msg;
2684 err_msg << "Error: the block number " << block_vec_number[b]
2685 << " appears twice.\n";
2686 throw OomphLibError(
2687 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2688 }
2689 }
2690#endif
2691
2692 // Number of blocks to get.
2693 const unsigned n_block = block_vec_number.size();
2694
2695 // Each block is made of dof types. We get the most fine grain dof types.
2696 // Most fine grain in the sense that these are the dof types that belongs
2697 // in this block before any coarsening of dof types has taken place.
2698 // The ordering of the dof types matters, this is handled properly when
2699 // creating the Block_to_dof_map_fine vector and must be respected here.
2700 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
2701 // in the vector most_fine_grain_dof.
2702 Vector<unsigned> most_fine_grain_dof;
2703 for (unsigned b = 0; b < n_block; b++)
2704 {
2705 const unsigned mapped_b = block_vec_number[b];
2706 most_fine_grain_dof.insert(most_fine_grain_dof.end(),
2707 Block_to_dof_map_fine[mapped_b].begin(),
2708 Block_to_dof_map_fine[mapped_b].end());
2709 }
2710
2711 // Get all the dof level vectors in one go.
2712 Vector<DoubleVector> dof_block_vector;
2713 internal_get_block_vectors(most_fine_grain_dof, v, dof_block_vector);
2714
2715 // Next we need to build the output DoubleVector w with the correct
2716 // distribution: the concatenation of the distributions of all the
2717 // dof-level vectors. This is the same as the concatenation of the
2718 // distributions of the blocks within this preconditioner.
2719 //
2720 // So we first check if it exists already, if not, we create it and
2721 // store it for future use. We store it because concatenation of
2722 // distributions requires communication, so concatenation of
2723 // distributions on-the-fly should be avoided.
2724 std::map<Vector<unsigned>, LinearAlgebraDistribution*>::const_iterator iter;
2725
2726 // Attempt to get an iterator pointing to the pair with the value
2727 // block_vec_number.
2728 iter = Auxiliary_block_distribution_pt.find(block_vec_number);
2729
2730 if (iter != Auxiliary_block_distribution_pt.end())
2731 // If it exists, build w with the distribution pointed to
2732 // by pair::second.
2733 {
2734 w.build(iter->second);
2735 }
2736 else
2737 // Else, we need to create the distribution and store it in
2738 // Auxiliary_block_distribution_pt.
2739 {
2740 Vector<LinearAlgebraDistribution*> tmp_vec_dist_pt(n_block, 0);
2741 for (unsigned b = 0; b < n_block; b++)
2742 {
2743 tmp_vec_dist_pt[b] = Block_distribution_pt[block_vec_number[b]];
2744 }
2745
2746 // Note that the distribution is created with new but not deleted here.
2747 // This is handled in the clean up functions.
2748 LinearAlgebraDistribution* tmp_dist_pt = new LinearAlgebraDistribution;
2750 *tmp_dist_pt);
2751
2752 // Store the pair of Vector<unsigned> and LinearAlgebraDistribution*
2753 insert_auxiliary_block_distribution(block_vec_number, tmp_dist_pt);
2754
2755 // Build w.
2756 w.build(tmp_dist_pt);
2757 }
2758
2759 // Now concatenate all the dof level vectors into the vector w.
2761
2762 } // get_concatenated_block_vector(...)
2763
2764 //============================================================================
2765 /// Takes concatenated block ordered vector, b, and copies its
 2767 /// entries to the appropriate entries in the naturally ordered vector, v.
 2768 /// Here the values in block_vec_number indicate which blocks the vector
 2769 /// b is a concatenation of. The block numbers are those in the current
 2770 /// preconditioner. If the preconditioner is a subsidiary block
 2771 /// preconditioner the other entries in v that are not associated with it
2771 // are left alone.
2772 //============================================================================
2773 template<typename MATRIX>
2775 const Vector<unsigned>& block_vec_number,
2776 const DoubleVector& w,
2777 DoubleVector& v) const
2778 {
2779#ifdef PARANOID
2780
2781 // Check if v is built.
2782 if (!v.built())
2783 {
2784 std::ostringstream err_msg;
2785 err_msg << "The distribution of the global vector v must be setup.";
2786 throw OomphLibError(
2787 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2788 }
2789
2790 // v must have the same distribution as the upper-most master block
2791 // preconditioner, since the upper-most master block preconditioner
2792 // should have the same distribution as the matrix pointed to
2793 // by matrix_pt().
2794 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
2795 {
2796 std::ostringstream err_msg;
2797 err_msg << "The distribution of the global vector v must match the "
2798 << " specified master_distribution_pt(). \n"
2799 << "i.e. Distribution_pt in the master preconditioner";
2800 throw OomphLibError(
2801 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2802 }
2803
2804 // Check to see if there are more blocks defined in the block_vec_number
2805 // vector than the number of block types. This is not allowed.
2806 const unsigned para_block_vec_number_size = block_vec_number.size();
2807 const unsigned para_n_block = nblock_types();
2808 if (para_block_vec_number_size > para_n_block)
2809 {
2810 std::ostringstream err_msg;
2811 err_msg << "Trying to return " << para_block_vec_number_size
2812 << " block vectors.\n"
2813 << "But there are only " << para_n_block << " block types.\n";
2815 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2816 }
2817
2818 // Check if any block numbers defined in block_vec_number is equal to or
2819 // greater than the number of block types.
2820 // E.g. if there are 5 block types, we can only have block numbers:
2821 // 0, 1, 2, 3 and 4.
2822 for (unsigned b = 0; b < para_block_vec_number_size; b++)
2823 {
2824 const unsigned para_required_block = block_vec_number[b];
2825 if (para_required_block > para_n_block)
2826 {
2827 std::ostringstream err_msg;
2828 err_msg << "block_vec_number[" << b << "] is " << para_required_block
2829 << ".\n"
2830 << "But there are only " << para_n_block << " block types.\n";
2831 throw OomphLibError(
2832 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2833 }
2834 }
2835
2836 // Check that no block number is inserted twice.
2837 std::set<unsigned> para_set;
2838 for (unsigned b = 0; b < para_block_vec_number_size; b++)
2839 {
2840 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
2841 para_set_ret = para_set.insert(block_vec_number[b]);
2842
2843 if (!para_set_ret.second)
2844 {
2845 std::ostringstream err_msg;
2846 err_msg << "Error: the block number " << block_vec_number[b]
2847 << " appears twice.\n";
2848 throw OomphLibError(
2849 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2850 }
2851 }
2852
2853 // Check that w is built.
2854 if (!w.built())
2855 {
2856 std::ostringstream err_msg;
2857 err_msg << "The distribution of the block vector w must be setup.";
2858 throw OomphLibError(
2859 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2860 }
2861
2862 // Check that the distributions defined by block_vec_number is correct for
2863 // the distribution from w.
2864 // Recall that w is the concatenation of the block vectors defined by
2865 // the values in block_vec_number. We check that this is the case.
2866 Vector<LinearAlgebraDistribution*> para_vec_dist_pt(
2867 para_block_vec_number_size, 0);
2868
2869 for (unsigned b = 0; b < para_block_vec_number_size; b++)
2870 {
2871 para_vec_dist_pt[b] = Block_distribution_pt[block_vec_number[b]];
2872 }
2873
2874 LinearAlgebraDistribution para_tmp_dist;
2875
2877 para_tmp_dist);
2878
2879 if (*w.distribution_pt() != para_tmp_dist)
2880 {
2881 std::ostringstream err_msg;
2882 err_msg << "The distribution of the block vector w does not match \n"
2883 << "the concatenation of the block distributions defined in \n"
2884 << "block_vec_number.\n";
2885 throw OomphLibError(
2886 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2887 }
2888#endif
2889
2890 // Number of blocks to return.
2891 const unsigned n_block = block_vec_number.size();
2892
2893 // Each block is made of dof types. We get the most fine grain dof types.
2894 // Most fine grain in the sense that these are the dof types that belongs
2895 // in this block before any coarsening of dof types has taken place.
2896 // The ordering of the dof types matters, this is handled properly when
2897 // creating the Block_to_dof_map_fine vector and must be respected here.
2898 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
2899 // in the vector most_fine_grain_dof.
2900 Vector<unsigned> most_fine_grain_dof;
2901 for (unsigned b = 0; b < n_block; b++)
2902 {
2903 const unsigned mapped_b = block_vec_number[b];
2904 most_fine_grain_dof.insert(most_fine_grain_dof.end(),
2905 Block_to_dof_map_fine[mapped_b].begin(),
2906 Block_to_dof_map_fine[mapped_b].end());
2907 }
2908
2909 // The number of most fine grain dof types associated with the blocks
2910 // defined by block_vec_number.
2911 const unsigned ndof = most_fine_grain_dof.size();
2912
2913 // Build each dof level vector with the correct distribution.
2914 Vector<DoubleVector> dof_vector(ndof);
2915 for (unsigned d = 0; d < ndof; d++)
2916 {
2917 dof_vector[d].build(
2918 internal_block_distribution_pt(most_fine_grain_dof[d]));
2919 }
2920
2921 // Perform the splitting of w into the most fine grain dof level vectors.
2923
2924 // Return all the dof level vectors in one go.
2925 internal_return_block_vectors(most_fine_grain_dof, dof_vector, v);
2926 } // return_concatenated_block_vector(...)
2927
2928 //============================================================================
2929 /// Takes the naturally ordered vector and rearranges it into a
2930 /// vector of sub vectors corresponding to the blocks, so s[b][i] contains
2931 /// the i-th entry in the vector associated with block b.
2932 /// Note: If the preconditioner is a subsidiary preconditioner then only the
2933 /// sub-vectors associated with the blocks of the subsidiary preconditioner
2934 /// will be included. Hence the length of v is master_nrow() whereas the
2935 /// total length of the s vectors is the sum of the lengths of the
2936 /// individual block vectors defined in block_vec_number.
2937 //============================================================================
2938 template<typename MATRIX>
2940 const Vector<unsigned>& block_vec_number,
2941 const DoubleVector& v,
2942 Vector<DoubleVector>& s) const
2943 {
2944#ifdef PARANOID
2945
2946 // Check if v is built.
2947 if (!v.built())
2948 {
2949 std::ostringstream err_msg;
2950 err_msg << "The distribution of the global vector v must be setup.";
2951 throw OomphLibError(
2952 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2953 }
2954
2955 // v must have the same distribution as the upper-most master block
2956 // preconditioner, since the upper-most master block preconditioner
2957 // should have the same distribution as the matrix pointed to
2958 // by matrix_pt().
2959 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
2960 {
2961 std::ostringstream err_msg;
2962 err_msg << "The distribution of the global vector v must match the "
2963 << " specified master_distribution_pt(). \n"
2964 << "i.e. Distribution_pt in the master preconditioner";
2965 throw OomphLibError(
2966 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2967 }
2968
2969 // Check to see if there are more blocks defined in the block_vec_number
2970 // vector than the number of block types. This is not allowed.
2971 const unsigned para_nblock_types = nblock_types();
2972 const unsigned para_block_vec_number_size = block_vec_number.size();
2973 if (para_block_vec_number_size > para_nblock_types)
2974 {
2975 std::ostringstream err_msg;
2976 err_msg << "You have requested " << para_block_vec_number_size
2977 << " number of blocks, (block_vec_number.size() is "
2978 << para_block_vec_number_size << ").\n"
2979 << "But there are only " << para_nblock_types
2980 << " nblock_types.\n"
2981 << "Please make sure that block_vec_number is correctly sized.\n";
2982 throw OomphLibError(
2983 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
2984 }
2985
2986 // Check if any block numbers defined in block_vec_number is equal to or
2987 // greater than the number of block types.
2988 // E.g. if there are 5 block types, we can only have block numbers:
2989 // 0, 1, 2, 3 and 4.
2990 for (unsigned i = 0; i < para_block_vec_number_size; i++)
2991 {
2992 const unsigned para_required_block = block_vec_number[i];
2993 if (para_required_block > para_nblock_types)
2994 {
2995 std::ostringstream err_msg;
2996 err_msg << "block_vec_number[" << i << "] is " << para_required_block
2997 << ".\n"
2998 << "But there are only " << para_nblock_types
2999 << " nblock_types.\n";
3000 throw OomphLibError(
3001 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3002 }
3003 }
3004 // Check that no block number is inserted twice.
3005 std::set<unsigned> para_set;
3006 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3007 {
3008 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
3009 para_set_ret = para_set.insert(block_vec_number[b]);
3010
3011 if (!para_set_ret.second)
3012 {
3013 std::ostringstream err_msg;
3014 err_msg << "Error: the block number " << block_vec_number[b]
3015 << " appears twice.\n";
3016 throw OomphLibError(
3017 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3018 }
3019 }
3020#endif
3021
3022 // Number of blocks to get.
3023 const unsigned n_block = block_vec_number.size();
3024 s.resize(n_block);
3025
3026 // Each block is made of dof types. We get the most fine grain dof types.
3027 // Most fine grain in the sense that these are the dof types that belongs
3028 // in this block before any coarsening of dof types has taken place.
3029 // The ordering of the dof types matters, this is handled properly when
3030 // creating the Block_to_dof_map_fine vector and must be respected here.
3031 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
3032 // in the vector most_fine_grain_dof.
3033 Vector<unsigned> most_fine_grain_dof;
3034 for (unsigned b = 0; b < n_block; b++)
3035 {
3036 const unsigned mapped_b = block_vec_number[b];
3037
3038 most_fine_grain_dof.insert(most_fine_grain_dof.end(),
3039 Block_to_dof_map_fine[mapped_b].begin(),
3040 Block_to_dof_map_fine[mapped_b].end());
3041 }
3042
3043 // Get all the dof level vectors in one go.
3044 Vector<DoubleVector> dof_vector;
3045 internal_get_block_vectors(most_fine_grain_dof, v, dof_vector);
3046
3047 // For each block vector requested,
3048 // build the block s[b],
3049 // concatenate the corresponding dof vector
3050
3051 // Since all the dof vectors are in dof_vector,
3052 // we need to loop through this.
3053 // The offset helps us loop through this.
3054 unsigned offset = 0;
3055
3056 for (unsigned b = 0; b < n_block; b++)
3057 {
3058 // The actual block number required.
3059 const unsigned mapped_b = block_vec_number[b];
3060
3061 // How many most fine grain dofs are in this block?
3062 const unsigned n_dof = Block_to_dof_map_fine[mapped_b].size();
3063
3064 if (n_dof == 1)
3065 // No need to concatenate, just copy the DoubleVector.
3066 {
3067 s[b] = dof_vector[offset];
3068 }
3069 else
3070 // Concatenate the relevant dof vectors into s[b].
3071 {
3072 s[b].build(Block_distribution_pt[mapped_b], 0);
3073 Vector<DoubleVector*> tmp_vec_pt(n_dof, 0);
3074 for (unsigned vec_i = 0; vec_i < n_dof; vec_i++)
3075 {
3076 tmp_vec_pt[vec_i] = &dof_vector[offset + vec_i];
3077 }
3078
3080 s[b]);
3081 }
3082
3083 // Update the offset.
3084 offset += n_dof;
3085 }
3086 } // get_block_vectors(...)
3087
3088
3089 //============================================================================
3090 /// Takes the naturally ordered vector and rearranges it into a
3091 /// vector of sub vectors corresponding to the blocks, so s[b][i] contains
3092 /// the i-th entry in the vector associated with block b.
3093 /// Note: If the preconditioner is a subsidiary preconditioner then only the
3094 /// sub-vectors associated with the blocks of the subsidiary preconditioner
3095 /// will be included. Hence the length of v is master_nrow() whereas the
3096 /// total length of the s vectors is Nrow.
3097 /// This is simply a wrapper around the other get_block_vectors(...) function
3098 /// where the block_vec_number Vector is the identity, i.e.
3099 /// block_vec_number is [0, 1, ..., nblock_types - 1].
3100 //============================================================================
3101 template<typename MATRIX>
3103 const DoubleVector& v, Vector<DoubleVector>& s) const
3104 {
3105 // Get the number of blocks in this block preconditioner.
3106 const unsigned n_block = nblock_types();
3107
3108 // Create the identity vector.
3109 Vector<unsigned> required_block(n_block, 0);
3110 for (unsigned i = 0; i < n_block; i++)
3111 {
3112 required_block[i] = i;
3113 }
3114
3115 // Call the other function which does the work.
3116 get_block_vectors(required_block, v, s);
3117 }
3118
3119 //============================================================================
3120 /// Takes the naturally ordered vector and
3121 /// rearranges it into a vector of sub vectors corresponding to the blocks,
3122 /// so s[b][i] contains the i-th entry in the vector associated with block b.
3123 /// The block_vec_number indicates which blocks we want.
3124 /// These blocks and vectors are those corresponding to the internal blocks.
3125 /// Note: If the preconditioner is a subsidiary preconditioner then only the
3126 /// sub-vectors associated with the blocks of the subsidiary preconditioner
3127 /// will be included. Hence the length of v is master_nrow() whereas the
3128 /// total length of the s vectors is the sum of the Nrow of the sub vectors.
3129 //============================================================================
3130 template<typename MATRIX>
3132 const Vector<unsigned>& block_vec_number,
3133 const DoubleVector& v,
3134 Vector<DoubleVector>& s) const
3135 {
3136#ifdef PARANOID
3137 if (!v.built())
3138 {
3139 std::ostringstream error_message;
3140 error_message << "The distribution of the global vector v must be setup.";
3141 throw OomphLibError(
3142 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3143 }
3144 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
3145 {
3146 std::ostringstream error_message;
3147 error_message << "The distribution of the global vector v must match the "
3148 << " specified master_distribution_pt(). \n"
3149 << "i.e. Distribution_pt in the master preconditioner";
3150 throw OomphLibError(
3151 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3152 }
3153#endif
3154
3155 // Number of block types
3156 // const unsigned nblock = this->internal_nblock_types();
3157 const unsigned nblock = block_vec_number.size();
3158
3159 // if + only one processor
3160 // + more than one processor but matrix_pt is not distributed
3161 // then use the serial get_block method
3162 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
3163 !this->distribution_pt()->distributed())
3164 {
3165 // Vector of vectors for each section of residual vector
3166 s.resize(nblock);
3167
3168 // pointer to the data in v
3169 const double* v_pt = v.values_pt();
3170
3171 // setup the block vector and then insert the data
3172 for (unsigned b = 0; b < nblock; b++)
3173 {
3174 const unsigned required_block = block_vec_number[b];
3175 s[b].build(Internal_block_distribution_pt[required_block], 0.0);
3176 double* s_pt = s[b].values_pt();
3177 unsigned nrow = s[b].nrow();
3178 for (unsigned i = 0; i < nrow; i++)
3179 {
3180 s_pt[i] = v_pt[this->Global_index[required_block][i]];
3181 }
3182 }
3183 }
3184 // otherwise use mpi
3185 else
3186 {
3187#ifdef OOMPH_HAS_MPI
3188 // my rank
3189 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
3190
3191 // the number of processors
3192 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
3193
3194 // build the vectors
3195 s.resize(nblock);
3196 for (unsigned b = 0; b < nblock; b++)
3197 {
3198 const unsigned required_block = block_vec_number[b];
3199 s[b].build(Internal_block_distribution_pt[required_block], 0.0);
3200 }
3201
3202 // determine the maximum number of rows to be sent or recv
3203 // and determine the number of blocks each processor will send and recv
3204 // communication for
3205 Vector<int> nblock_send(nproc, 0);
3206 Vector<int> nblock_recv(nproc, 0);
3207 unsigned max_n_send_or_recv = 0;
3208 for (unsigned p = 0; p < nproc; p++)
3209 {
3210 for (unsigned b = 0; b < nblock; b++)
3211 {
3212 const unsigned required_block = block_vec_number[b];
3213 max_n_send_or_recv = std::max(
3214 max_n_send_or_recv, Nrows_to_send_for_get_block(required_block, p));
3215 max_n_send_or_recv = std::max(
3216 max_n_send_or_recv, Nrows_to_recv_for_get_block(required_block, p));
3217 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3218 {
3219 nblock_send[p]++;
3220 }
3221 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3222 {
3223 nblock_recv[p]++;
3224 }
3225 }
3226 }
3227
3228 // create a vectors of 1s the size of the nblock for the mpi indexed
3229 // data types
3230 int* block_lengths = new int[max_n_send_or_recv];
3231 for (unsigned i = 0; i < max_n_send_or_recv; i++)
3232 {
3233 block_lengths[i] = 1;
3234 }
3235
3236 // perform the sends and receives
3237 Vector<MPI_Request> requests;
3238 for (unsigned p = 0; p < nproc; p++)
3239 {
3240 // send and recv with other processors
3241 if (p != my_rank)
3242 {
3243 // send
3244 if (nblock_send[p] > 0)
3245 {
3246 // create the datatypes vector
3247 MPI_Datatype block_send_types[nblock_send[p]];
3248
3249 // create the datatypes
3250 unsigned ptr = 0;
3251 for (unsigned b = 0; b < nblock; b++)
3252 {
3253 const unsigned required_block = block_vec_number[b];
3254
3255 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3256 {
3257 MPI_Type_indexed(Nrows_to_send_for_get_block(required_block, p),
3258 block_lengths,
3259 Rows_to_send_for_get_block(required_block, p),
3260 MPI_DOUBLE,
3261 &block_send_types[ptr]);
3262 MPI_Type_commit(&block_send_types[ptr]);
3263 ptr++;
3264 }
3265 }
3266
3267 // compute the displacements and lengths
3268 MPI_Aint displacements[nblock_send[p]];
3269 int lengths[nblock_send[p]];
3270 for (int i = 0; i < nblock_send[p]; i++)
3271 {
3272 lengths[i] = 1;
3273 displacements[i] = 0;
3274 }
3275
3276 // build the final datatype
3277 MPI_Datatype type_send;
3278 MPI_Type_create_struct(nblock_send[p],
3279 lengths,
3280 displacements,
3281 block_send_types,
3282 &type_send);
3283 MPI_Type_commit(&type_send);
3284
3285 // send
3286 MPI_Request send_req;
3287 MPI_Isend(const_cast<double*>(v.values_pt()),
3288 1,
3289 type_send,
3290 p,
3291 0,
3292 this->distribution_pt()->communicator_pt()->mpi_comm(),
3293 &send_req);
3294 MPI_Type_free(&type_send);
3295 for (int i = 0; i < nblock_send[p]; i++)
3296 {
3297 MPI_Type_free(&block_send_types[i]);
3298 }
3299 requests.push_back(send_req);
3300 }
3301
3302 // recv
3303 if (nblock_recv[p] > 0)
3304 {
3305 // create the datatypes vector
3306 MPI_Datatype block_recv_types[nblock_recv[p]];
3307
3308 // and the displacements
3309 MPI_Aint displacements[nblock_recv[p]];
3310
3311 // and the lengths
3312 int lengths[nblock_recv[p]];
3313
3314 // all displacements are computed relative to s[0] values
3315 MPI_Aint displacements_base;
3316 MPI_Get_address(s[0].values_pt(), &displacements_base);
3317
3318 // now build
3319 unsigned ptr = 0;
3320 for (unsigned b = 0; b < nblock; b++)
3321 {
3322 const unsigned required_block = block_vec_number[b];
3323
3324 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3325 {
3326 MPI_Type_indexed(Nrows_to_recv_for_get_block(required_block, p),
3327 block_lengths,
3328 Rows_to_recv_for_get_block(required_block, p),
3329 MPI_DOUBLE,
3330 &block_recv_types[ptr]);
3331 MPI_Type_commit(&block_recv_types[ptr]);
3332 MPI_Get_address(s[b].values_pt(), &displacements[ptr]);
3333 displacements[ptr] -= displacements_base;
3334 lengths[ptr] = 1;
3335 ptr++;
3336 }
3337 }
3338
3339 // build the final data type
3340 MPI_Datatype type_recv;
3341 MPI_Type_create_struct(nblock_recv[p],
3342 lengths,
3343 displacements,
3344 block_recv_types,
3345 &type_recv);
3346 MPI_Type_commit(&type_recv);
3347
3348 // recv
3349 MPI_Request recv_req;
3350 MPI_Irecv(s[0].values_pt(),
3351 1,
3352 type_recv,
3353 p,
3354 0,
3355 this->distribution_pt()->communicator_pt()->mpi_comm(),
3356 &recv_req);
3357 MPI_Type_free(&type_recv);
3358 for (int i = 0; i < nblock_recv[p]; i++)
3359 {
3360 MPI_Type_free(&block_recv_types[i]);
3361 }
3362 requests.push_back(recv_req);
3363 }
3364 }
3365
3366 // communicate with self
3367 else
3368 {
3369 const double* v_values_pt = v.values_pt();
3370 for (unsigned b = 0; b < nblock; b++)
3371 {
3372 const unsigned required_block = block_vec_number[b];
3373
3374 double* w_values_pt = s[b].values_pt();
3375 for (unsigned i = 0;
3376 i < Nrows_to_send_for_get_block(required_block, p);
3377 i++)
3378 {
3379 w_values_pt[Rows_to_recv_for_get_block(required_block, p)[i]] =
3380 v_values_pt[Rows_to_send_for_get_block(required_block, p)[i]];
3381 }
3382 }
3383 }
3384 }
3385
3386 // and then just wait
3387 unsigned c = requests.size();
3388 Vector<MPI_Status> stat(c);
3389 if (c)
3390 {
3391 MPI_Waitall(c, &requests[0], &stat[0]);
3392 }
3393 delete[] block_lengths;
3394
3395#else
3396 // throw error
3397 std::ostringstream error_message;
3398 error_message << "The preconditioner is distributed and on more than one "
3399 << "processor. MPI is required.";
3400 throw OomphLibError(
3401 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3402#endif
3403 }
3404 }
3405
3406 //============================================================================
3407 /// A helper function, takes the naturally ordered vector and
3408 /// rearranges it into a vector of sub vectors corresponding to the blocks,
3409 /// so s[b][i] contains the i-th entry in the vector associated with block b.
3410 /// The block_vec_number indicates which blocks we want.
3411 /// These blocks and vectors are those corresponding to the internal blocks.
3412 /// Note: If the preconditioner is a subsidiary preconditioner then only the
3413 /// sub-vectors associated with the blocks of the subsidiary preconditioner
3414 /// will be included. Hence the length of v is master_nrow() whereas the
3415 /// total length of the s vectors is the sum of the Nrow of the sub vectors.
3416 /// This is simply a wrapper around the other internal_get_block_vectors(...)
3417 /// function with the identity block_vec_number vector.
3418 //============================================================================
3419 template<typename MATRIX>
3421 const DoubleVector& v, Vector<DoubleVector>& s) const
3422 {
3423 // Number of block types
3424 const unsigned nblock = this->internal_nblock_types();
3425 Vector<unsigned> block_vec_number(nblock, 0);
3426 for (unsigned b = 0; b < nblock; b++)
3427 {
3428 block_vec_number[b] = b;
3429 }
3430
3431 internal_get_block_vectors(block_vec_number, v, s);
3432 }
3433
3434 //============================================================================
3435 /// Takes the vector of block vectors, s, and copies its entries into
3436 /// the naturally ordered vector, v. If this is a subsidiary block
3437 /// preconditioner only those entries in v that are associated with its
3438 /// blocks are affected. The block_vec_number indicates which block the
3439 /// vectors in s came from. The block number corresponds to the block
3440 /// numbers in this preconditioner.
3441 //============================================================================
3442 template<typename MATRIX>
3444 const Vector<unsigned>& block_vec_number,
3445 const Vector<DoubleVector>& s,
3446 DoubleVector& v) const
3447 {
3448#ifdef PARANOID
3449
3450 // Check if v is built.
3451 if (!v.built())
3452 {
3453 std::ostringstream err_msg;
3454 err_msg << "The distribution of the global vector v must be setup.";
3455 throw OomphLibError(
3456 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3457 }
3458
3459 // v must have the same distribution as the upper-most master block
3460 // preconditioner, since the upper-most master block preconditioner
3461 // should have the same distribution as the matrix pointed to
3462 // by matrix_pt().
3463 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
3464 {
3465 std::ostringstream err_msg;
3466 err_msg << "The distribution of the global vector v must match the "
3467 << " specified master_distribution_pt(). \n"
3468 << "i.e. Distribution_pt in the master preconditioner";
3469 throw OomphLibError(
3470 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3471 }
3472
3473 // Check if the number of vectors in s is the same as the number of block
3474 // numbers described in block_vec_number.
3475 const unsigned para_block_vec_number_size = block_vec_number.size();
3476 const unsigned para_s_size = s.size();
3477 if (para_block_vec_number_size != para_s_size)
3478 {
3479 std::ostringstream err_msg;
3480 err_msg << "block_vec_number.size() is " << para_block_vec_number_size
3481 << "\n."
3482 << "s.size() is " << para_s_size << ".\n"
3483 << "But they must be the same size!\n";
3484 throw OomphLibError(
3485 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3486 }
3487
3488 // Check to see if there are more blocks defined in the block_vec_number
3489 // vector than the number of block types. This is not allowed.
3490 const unsigned para_n_block = nblock_types();
3491 if (para_block_vec_number_size > para_n_block)
3492 {
3493 std::ostringstream err_msg;
3494 err_msg << "Trying to return " << para_block_vec_number_size
3495 << " block vectors.\n"
3496 << "But there are only " << para_n_block << " block types.\n";
3497 throw OomphLibError(
3498 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3499 }
3500
3501 // Check if any block numbers defined in block_vec_number is equal to or
3502 // greater than the number of block types.
3503 // E.g. if there are 5 block types, we can only have block numbers:
3504 // 0, 1, 2, 3 and 4.
3505 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3506 {
3507 const unsigned para_required_block = block_vec_number[b];
3508 if (para_required_block > para_n_block)
3509 {
3510 std::ostringstream err_msg;
3511 err_msg << "block_vec_number[" << b << "] is " << para_required_block
3512 << ".\n"
3513 << "But there are only " << para_n_block << " block types.\n";
3514 throw OomphLibError(
3515 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3516 }
3517 }
3518
3519 // Check that no block number is inserted twice.
3520 std::set<unsigned> para_set;
3521 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3522 {
3523 std::pair<std::set<unsigned>::iterator, bool> para_set_ret;
3524 para_set_ret = para_set.insert(block_vec_number[b]);
3525
3526 if (!para_set_ret.second)
3527 {
3528 std::ostringstream err_msg;
3529 err_msg << "Error: the block number " << block_vec_number[b]
3530 << " appears twice.\n";
3531 throw OomphLibError(
3532 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3533 }
3534 }
3535
3536 // Check to see that all the vectors in s are built
3537 // (since we are trying to return them).
3538 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3539 {
3540 if (!s[b].built())
3541 {
3542 std::ostringstream err_msg;
3543 err_msg << "The distribution of the block vector s[" << b
3544 << "] must be setup.\n";
3545 throw OomphLibError(
3546 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3547 }
3548 }
3549
3550 // Since these are built, we check that the distributions are correct.
3551 // This are incorrect if the block numbers in block_vec_number and
3552 // the vectors in s does not match.
3553 for (unsigned b = 0; b < para_block_vec_number_size; b++)
3554 {
3555 if (*(s[b].distribution_pt()) !=
3556 *(Block_distribution_pt[block_vec_number[b]]))
3557 {
3558 std::ostringstream error_message;
3559 error_message
3560 << "The distribution of the block vector " << b << " must match the"
3561 << " specified distribution at "
3562 << "Block_distribution_pt[" << block_vec_number[b] << "].\n"
3563 << "The distribution of the Block_distribution_pt is determined by\n"
3564 << "the vector block_vec_number. Perhaps it is incorrect?\n";
3565 throw OomphLibError(error_message.str(),
3566 OOMPH_CURRENT_FUNCTION,
3567 OOMPH_EXCEPTION_LOCATION);
3568 }
3569 }
3570#endif
3571
3572 // Number of blocks to get.
3573 const unsigned n_block = block_vec_number.size();
3574
3575 // Each block is made of dof types. We get the most fine grain dof types.
3576 // Most fine grain in the sense that these are the dof types that belongs
3577 // in this block before any coarsening of dof types has taken place.
3578 // The ordering of the dof types matters, this is handled properly when
3579 // creating the Block_to_dof_map_fine vector and must be respected here.
3580 // I.e. we cannot arbitrarily insert dof types (even if they are correct)
3581 // in the vector most_fine_grain_dof.
3582 Vector<unsigned> most_fine_grain_dof;
3583 for (unsigned b = 0; b < n_block; b++)
3584 {
3585 const unsigned mapped_b = block_vec_number[b];
3586
3587 most_fine_grain_dof.insert(most_fine_grain_dof.end(),
3588 Block_to_dof_map_fine[mapped_b].begin(),
3589 Block_to_dof_map_fine[mapped_b].end());
3590 }
3591
3592 // Split all the blocks into it's most fine grain dof vector.
3593 Vector<DoubleVector> dof_vector(most_fine_grain_dof.size());
3594
3595 unsigned offset = 0;
3596
3597 // Perform the splitting for each block.
3598 for (unsigned b = 0; b < n_block; b++)
3599 {
3600 // The actual block number.
3601 const unsigned mapped_b = block_vec_number[b];
3602
3603 // How many most fine grain dof types are associated with this block?
3604 const unsigned ndof = Block_to_dof_map_fine[mapped_b].size();
3605
3606 if (ndof == 1)
3607 // No need to split, just copy.
3608 {
3609 dof_vector[offset] = s[b];
3610 }
3611 else
3612 // Need to split s[b] into it's most fine grain dof vectors
3613 {
3614 // To store pointers to the dof vectors associated with this block.
3615 Vector<DoubleVector*> tmp_dof_vector_pt(ndof, 0);
3616
3617 for (unsigned d = 0; d < ndof; d++)
3618 {
3619 const unsigned offset_plus_d = offset + d;
3620
3621 // build the dof vector.
3622 dof_vector[offset_plus_d].build(
3623 Internal_block_distribution_pt[most_fine_grain_dof[offset_plus_d]]);
3624
3625 // Store the pointer.
3626 tmp_dof_vector_pt[d] = &dof_vector[offset_plus_d];
3627 }
3628
3629 // Split without communication.
3631 tmp_dof_vector_pt);
3632 }
3633
3634 // Update the offset!
3635 offset += ndof;
3636 }
3637
3638 // Return the block vectors all in one go.
3639 internal_return_block_vectors(most_fine_grain_dof, dof_vector, v);
3640 } // return_block_vectors(...)
3641
3642
3643 //============================================================================
3644 /// Takes the vector of block vectors, s, and copies its entries into
3645 /// the naturally ordered vector, v. If this is a subsidiary block
3646 /// preconditioner only those entries in v that are associated with its
3647 /// blocks are affected. The block_vec_number indicates which block the
3648 /// vectors in s came from. The block number corresponds to the block
3649 /// numbers in this preconditioner.
3650 /// This is simply a wrapper around the other return_block_vectors(...)
3651 /// function where the block_vec_number Vector is the identity, i.e.
3652 /// block_vec_number is [0, 1, ..., nblock_types - 1].
3653 //============================================================================
3654 template<typename MATRIX>
3656 const Vector<DoubleVector>& s, DoubleVector& v) const
3657 {
3658 // The number of block types in this preconditioner.
3659 const unsigned n_block = nblock_types();
3660
3661 // Create the identity vector.
3662 Vector<unsigned> required_block(n_block, 0);
3663 for (unsigned i = 0; i < n_block; i++)
3664 {
3665 required_block[i] = i;
3666 }
3667
3668 // Call the other return_block_vectors function which does the work.
3669 return_block_vectors(required_block, s, v);
3670 } // return_block_vectors(...)
3671
  //============================================================================
  /// A helper function: takes the vector of block vectors, s, and copies
  /// their entries into the naturally ordered vector, v. The Vector
  /// block_vec_number indicates which internal block each vector in s
  /// corresponds to. If this preconditioner is a subsidiary block
  /// preconditioner, only those entries in v that are associated with its
  /// blocks are overwritten; all other entries are left alone. Hence the
  /// length of v is master_nrow() whereas the total length of the s vectors
  /// is the sum of the Nrow of the sub vectors.
  //============================================================================
3683 template<typename MATRIX>
3685 const Vector<unsigned>& block_vec_number,
3686 const Vector<DoubleVector>& s,
3687 DoubleVector& v) const
3688 {
3689 // the number of blocks
3690 const unsigned nblock = block_vec_number.size();
3691
3692#ifdef PARANOID
3693 if (!v.built())
3694 {
3695 std::ostringstream error_message;
3696 error_message << "The distribution of the global vector v must be setup.";
3697 throw OomphLibError(
3698 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3699 }
3700 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
3701 {
3702 std::ostringstream error_message;
3703 error_message << "The distribution of the global vector v must match the "
3704 << " specified master_distribution_pt(). \n"
3705 << "i.e. Distribution_pt in the master preconditioner";
3706 throw OomphLibError(
3707 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3708 }
3709 for (unsigned b = 0; b < nblock; b++)
3710 {
3711 if (!s[b].built())
3712 {
3713 std::ostringstream error_message;
3714 error_message << "The distribution of the block vector " << b
3715 << " must be setup.";
3716 throw OomphLibError(error_message.str(),
3717 OOMPH_CURRENT_FUNCTION,
3718 OOMPH_EXCEPTION_LOCATION);
3719 }
3720 const unsigned required_block = block_vec_number[b];
3721 if (*(s[b].distribution_pt()) !=
3722 *(Internal_block_distribution_pt[required_block]))
3723 {
3724 std::ostringstream error_message;
3725 error_message
3726 << "The distribution of the block vector " << b << " must match the"
3727 << " specified distribution at Internal_block_distribution_pt[" << b
3728 << "]";
3729 throw OomphLibError(error_message.str(),
3730 OOMPH_CURRENT_FUNCTION,
3731 OOMPH_EXCEPTION_LOCATION);
3732 }
3733 }
3734#endif
3735
3736 // if + only one processor
3737 // + more than one processor but matrix_pt is not distributed
3738 // then use the serial get_block method
3739 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
3740 !this->distribution_pt()->distributed())
3741 {
3742 double* v_pt = v.values_pt();
3743 for (unsigned b = 0; b < nblock; b++)
3744 {
3745 const unsigned required_block = block_vec_number[b];
3746
3747 const double* s_pt = s[b].values_pt();
3748 unsigned nrow = this->internal_block_dimension(required_block);
3749 for (unsigned i = 0; i < nrow; i++)
3750 {
3751 v_pt[this->Global_index[required_block][i]] = s_pt[i];
3752 }
3753 }
3754 }
3755 // otherwise use mpi
3756 else
3757 {
3758#ifdef OOMPH_HAS_MPI
3759
3760 // my rank
3761 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
3762
3763 // the number of processors
3764 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
3765
3766 // determine the maximum number of rows to be sent or recv
3767 // and determine the number of blocks each processor will send and recv
3768 // communication for
3769 Vector<int> nblock_send(nproc, 0);
3770 Vector<int> nblock_recv(nproc, 0);
3771 unsigned max_n_send_or_recv = 0;
3772 for (unsigned p = 0; p < nproc; p++)
3773 {
3774 for (unsigned b = 0; b < nblock; b++)
3775 {
3776 const unsigned required_block = block_vec_number[b];
3777
3778 max_n_send_or_recv = std::max(
3779 max_n_send_or_recv, Nrows_to_send_for_get_block(required_block, p));
3780 max_n_send_or_recv = std::max(
3781 max_n_send_or_recv, Nrows_to_recv_for_get_block(required_block, p));
3782 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3783 {
3784 nblock_recv[p]++;
3785 }
3786 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3787 {
3788 nblock_send[p]++;
3789 }
3790 }
3791 }
3792
3793 // create a vectors of 1s the size of the nblock for the mpi indexed
3794 // data types
3795 int* block_lengths = new int[max_n_send_or_recv];
3796 for (unsigned i = 0; i < max_n_send_or_recv; i++)
3797 {
3798 block_lengths[i] = 1;
3799 }
3800
3801 // perform the sends and receives
3802 Vector<MPI_Request> requests;
3803 for (unsigned p = 0; p < nproc; p++)
3804 {
3805 // send and recv with other processors
3806 if (p != my_rank)
3807 {
3808 // recv
3809 if (nblock_recv[p] > 0)
3810 {
3811 // create the datatypes vector
3812 MPI_Datatype block_recv_types[nblock_recv[p]];
3813
3814 // create the datatypes
3815 unsigned ptr = 0;
3816 for (unsigned b = 0; b < nblock; b++)
3817 {
3818 const unsigned required_block = block_vec_number[b];
3819
3820 if (Nrows_to_send_for_get_block(required_block, p) > 0)
3821 {
3822 MPI_Type_indexed(Nrows_to_send_for_get_block(required_block, p),
3823 block_lengths,
3824 Rows_to_send_for_get_block(required_block, p),
3825 MPI_DOUBLE,
3826 &block_recv_types[ptr]);
3827 MPI_Type_commit(&block_recv_types[ptr]);
3828 ptr++;
3829 }
3830 }
3831
3832 // compute the displacements and lengths
3833 MPI_Aint displacements[nblock_recv[p]];
3834 int lengths[nblock_recv[p]];
3835 for (int i = 0; i < nblock_recv[p]; i++)
3836 {
3837 lengths[i] = 1;
3838 displacements[i] = 0;
3839 }
3840
3841 // build the final datatype
3842 MPI_Datatype type_recv;
3843 MPI_Type_create_struct(nblock_recv[p],
3844 lengths,
3845 displacements,
3846 block_recv_types,
3847 &type_recv);
3848 MPI_Type_commit(&type_recv);
3849
3850 // recv
3851 MPI_Request recv_req;
3852 MPI_Irecv(v.values_pt(),
3853 1,
3854 type_recv,
3855 p,
3856 0,
3857 this->distribution_pt()->communicator_pt()->mpi_comm(),
3858 &recv_req);
3859 MPI_Type_free(&type_recv);
3860 for (int i = 0; i < nblock_recv[p]; i++)
3861 {
3862 MPI_Type_free(&block_recv_types[i]);
3863 }
3864 requests.push_back(recv_req);
3865 }
3866
3867 // send
3868 if (nblock_send[p] > 0)
3869 {
3870 // create the datatypes vector
3871 MPI_Datatype block_send_types[nblock_send[p]];
3872
3873 // and the displacements
3874 MPI_Aint displacements[nblock_send[p]];
3875
3876 // and the lengths
3877 int lengths[nblock_send[p]];
3878
3879 // all displacements are computed relative to s[0] values
3880 MPI_Aint displacements_base;
3881 MPI_Get_address(const_cast<double*>(s[0].values_pt()),
3882 &displacements_base);
3883
3884 // now build
3885 unsigned ptr = 0;
3886 for (unsigned b = 0; b < nblock; b++)
3887 {
3888 const unsigned required_block = block_vec_number[b];
3889
3890 if (Nrows_to_recv_for_get_block(required_block, p) > 0)
3891 {
3892 MPI_Type_indexed(Nrows_to_recv_for_get_block(required_block, p),
3893 block_lengths,
3894 Rows_to_recv_for_get_block(required_block, p),
3895 MPI_DOUBLE,
3896 &block_send_types[ptr]);
3897 MPI_Type_commit(&block_send_types[ptr]);
3898 MPI_Get_address(const_cast<double*>(s[b].values_pt()),
3899 &displacements[ptr]);
3900 displacements[ptr] -= displacements_base;
3901 lengths[ptr] = 1;
3902 ptr++;
3903 }
3904 }
3905
3906 // build the final data type
3907 MPI_Datatype type_send;
3908 MPI_Type_create_struct(nblock_send[p],
3909 lengths,
3910 displacements,
3911 block_send_types,
3912 &type_send);
3913 MPI_Type_commit(&type_send);
3914
3915 // send
3916 MPI_Request send_req;
3917 MPI_Isend(const_cast<double*>(s[0].values_pt()),
3918 1,
3919 type_send,
3920 p,
3921 0,
3922 this->distribution_pt()->communicator_pt()->mpi_comm(),
3923 &send_req);
3924 MPI_Type_free(&type_send);
3925 for (int i = 0; i < nblock_send[p]; i++)
3926 {
3927 MPI_Type_free(&block_send_types[i]);
3928 }
3929 requests.push_back(send_req);
3930 }
3931 }
3932
3933 // communicate wih self
3934 else
3935 {
3936 double* v_values_pt = v.values_pt();
3937 for (unsigned b = 0; b < nblock; b++)
3938 {
3939 const unsigned required_block = block_vec_number[b];
3940
3941 const double* w_values_pt = s[b].values_pt();
3942 for (unsigned i = 0;
3943 i < Nrows_to_send_for_get_block(required_block, p);
3944 i++)
3945 {
3946 v_values_pt[Rows_to_send_for_get_block(required_block, p)[i]] =
3947 w_values_pt[Rows_to_recv_for_get_block(required_block, p)[i]];
3948 }
3949 }
3950 }
3951 }
3952
3953 // and then just wait
3954 unsigned c = requests.size();
3955 Vector<MPI_Status> stat(c);
3956 if (c)
3957 {
3958 MPI_Waitall(c, &requests[0], &stat[0]);
3959 }
3960 delete[] block_lengths;
3961
3962#else
3963 // throw error
3964 std::ostringstream error_message;
3965 error_message << "The preconditioner is distributed and on more than one "
3966 << "processor. MPI is required.";
3967 throw OomphLibError(
3968 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
3969#endif
3970 }
3971 }
3972
  //============================================================================
  /// A helper function: takes the vector of block vectors, s, and copies
  /// their entries into the naturally ordered vector, v, treating s as
  /// containing every internal block in its natural (identity) order.
  /// Note: If the preconditioner is a subsidiary preconditioner then only
  /// those entries in v associated with its blocks are affected. Hence the
  /// length of v is master_nrow() whereas the total length of the s vectors
  /// is the sum of the Nrow of the sub vectors.
  /// This is simply a wrapper around the other
  /// internal_return_block_vectors(...) function with the identity
  /// block_vec_number vector.
  //============================================================================
3986 template<typename MATRIX>
3988 const Vector<DoubleVector>& s, DoubleVector& v) const
3989 {
3990 // the number of blocks
3991 const unsigned nblock = this->internal_nblock_types();
3992 Vector<unsigned> block_vec_number(nblock, 0);
3993 for (unsigned b = 0; b < nblock; b++)
3994 {
3995 block_vec_number[b] = b;
3996 }
3997
3998 internal_return_block_vectors(block_vec_number, s, v);
3999 }
4000
4001 //============================================================================
4002 /// A helper function, takes the naturally ordered vector, v,
4003 /// and extracts the n-th block vector, b.
4004 /// Here n is the block number in the current preconditioner.
4005 /// NOTE: The ordering of the vector b is the same as the
4006 /// ordering of the block matrix from internal_get_block(...).
4007 //============================================================================
4008 template<typename MATRIX>
4010 const unsigned& b, const DoubleVector& v, DoubleVector& w) const
4011 {
4012#ifdef PARANOID
4013 // the number of blocks
4014 const unsigned n_blocks = this->internal_nblock_types();
4015
4016 // paranoid check that block i is in this block preconditioner
4017 if (b >= n_blocks)
4018 {
4019 std::ostringstream error_message;
4020 error_message
4021 << "Requested block vector " << b
4022 << ", however this preconditioner has internal_nblock_types() "
4023 << "= " << internal_nblock_types() << std::endl;
4024 throw OomphLibError(
4025 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4026 }
4027 if (!v.built())
4028 {
4029 std::ostringstream error_message;
4030 error_message << "The distribution of the global vector v must be setup.";
4031 throw OomphLibError(
4032 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4033 }
4034 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
4035 {
4036 std::ostringstream error_message;
4037 error_message << "The distribution of the global vector v must match the "
4038 << " specified master_distribution_pt(). \n"
4039 << "i.e. Distribution_pt in the master preconditioner";
4040 throw OomphLibError(
4041 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4042 }
4043#endif
4044
4045 // rebuild the block vector
4046 w.build(Internal_block_distribution_pt[b], 0.0);
4047
4048 // if + only one processor
4049 // + more than one processor but matrix_pt is not distributed
4050 // then use the serial get_block method
4051 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4052 !this->distribution_pt()->distributed())
4053 {
4054 double* w_pt = w.values_pt();
4055 const double* v_pt = v.values_pt();
4056 unsigned n_row = w.nrow();
4057 for (unsigned i = 0; i < n_row; i++)
4058 {
4059 w_pt[i] = v_pt[this->Global_index[b][i]];
4060 }
4061 }
4062 // otherwise use mpi
4063 else
4064 {
4065#ifdef OOMPH_HAS_MPI
4066
4067 // my rank
4068 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4069
4070 // the number of processors
4071 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4072
4073 // determine the maximum number of rows to be sent or recv
4074 unsigned max_n_send_or_recv = 0;
4075 for (unsigned p = 0; p < nproc; p++)
4076 {
4077 max_n_send_or_recv =
4078 std::max(max_n_send_or_recv, Nrows_to_send_for_get_block(b, p));
4079 max_n_send_or_recv =
4080 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_block(b, p));
4081 }
4082
4083 // create a vectors of 1s (the size of the nblock for the mpi indexed
4084 // data types
4085 int* block_lengths = new int[max_n_send_or_recv];
4086 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4087 {
4088 block_lengths[i] = 1;
4089 }
4090
4091 // perform the sends and receives
4092 Vector<MPI_Request> requests;
4093 for (unsigned p = 0; p < nproc; p++)
4094 {
4095 // send and recv with other processors
4096 if (p != my_rank)
4097 {
4098 if (Nrows_to_send_for_get_block(b, p) > 0)
4099 {
4100 // create the send datatype
4101 MPI_Datatype type_send;
4102 MPI_Type_indexed(Nrows_to_send_for_get_block(b, p),
4103 block_lengths,
4104 Rows_to_send_for_get_block(b, p),
4105 MPI_DOUBLE,
4106 &type_send);
4107 MPI_Type_commit(&type_send);
4108
4109 // send
4110 MPI_Request send_req;
4111 MPI_Isend(const_cast<double*>(v.values_pt()),
4112 1,
4113 type_send,
4114 p,
4115 0,
4116 this->distribution_pt()->communicator_pt()->mpi_comm(),
4117 &send_req);
4118 MPI_Type_free(&type_send);
4119 requests.push_back(send_req);
4120 }
4121
4122 if (Nrows_to_recv_for_get_block(b, p) > 0)
4123 {
4124 // create the recv datatype
4125 MPI_Datatype type_recv;
4126 MPI_Type_indexed(Nrows_to_recv_for_get_block(b, p),
4127 block_lengths,
4128 Rows_to_recv_for_get_block(b, p),
4129 MPI_DOUBLE,
4130 &type_recv);
4131 MPI_Type_commit(&type_recv);
4132
4133 // recv
4134 MPI_Request recv_req;
4135 MPI_Irecv(w.values_pt(),
4136 1,
4137 type_recv,
4138 p,
4139 0,
4140 this->distribution_pt()->communicator_pt()->mpi_comm(),
4141 &recv_req);
4142 MPI_Type_free(&type_recv);
4143 requests.push_back(recv_req);
4144 }
4145 }
4146
4147 // communicate with self
4148 else
4149 {
4150 double* w_values_pt = w.values_pt();
4151 const double* v_values_pt = v.values_pt();
4152 for (unsigned i = 0; i < Nrows_to_send_for_get_block(b, p); i++)
4153 {
4154 w_values_pt[Rows_to_recv_for_get_block(b, p)[i]] =
4155 v_values_pt[Rows_to_send_for_get_block(b, p)[i]];
4156 }
4157 }
4158 }
4159
4160 // and then just wait
4161 unsigned c = requests.size();
4162 Vector<MPI_Status> stat(c);
4163 if (c)
4164 {
4165 MPI_Waitall(c, &requests[0], &stat[0]);
4166 }
4167 delete[] block_lengths;
4168
4169#else
4170 // throw error
4171 std::ostringstream error_message;
4172 error_message << "The preconditioner is distributed and on more than one "
4173 << "processor. MPI is required.";
4174 throw OomphLibError(
4175 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4176#endif
4177 }
4178 }
4179
4180 //============================================================================
4181 /// Takes the naturally ordered vector, v and returns the n-th
4182 /// block vector, b. Here n is the block number in the current
4183 /// preconditioner.
4184 //============================================================================
4185 template<typename MATRIX>
4187 const DoubleVector& v,
4188 DoubleVector& w) const
4189 {
4190#ifdef PARANOID
4191 // the number of blocks
4192 const unsigned para_n_blocks = nblock_types();
4193
4194 // paranoid check that block i is in this block preconditioner
4195 if (b >= para_n_blocks)
4196 {
4197 std::ostringstream err_msg;
4198 err_msg << "Requested block vector " << b
4199 << ", however this preconditioner has only " << para_n_blocks
4200 << " block types"
4201 << ".\n";
4202 throw OomphLibError(
4203 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4204 }
4205
4206 if (!v.built())
4207 {
4208 std::ostringstream err_msg;
4209 err_msg << "The distribution of the global vector v must be setup.";
4210 throw OomphLibError(
4211 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4212 }
4213 if (*(v.distribution_pt()) != *(this->master_distribution_pt()))
4214 {
4215 std::ostringstream err_msg;
4216 err_msg << "The distribution of the global vector v must match the "
4217 << " specified master_distribution_pt(). \n"
4218 << "i.e. Distribution_pt in the master preconditioner";
4219 throw OomphLibError(
4220 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4221 }
4222#endif
4223
4224 // Recall that, the relationship between the external blocks and the
4225 // external dof types, as seen by the preconditioner writer is stored in the
4226 // mapping Block_to_dof_map_coarse.
4227 //
4228 // However, each dof type could have been coarsened! The relationship
4229 // between the dof types of this preconditioner and the parent
4230 // preconditioner is stored in the mapping Doftype_coarsen_map_coarse. The
4231 // dof numbers in this map is relative to this preconditioner.
4232 //
4233 // Finally, the relationship between the dof types of this preconditioner
4234 // and the most fine grain dof types is stored in the mapping
4235 // Doftype_coarsen_map_fine. Again, the dof numbers in this map is relative
4236 // to this preconditioner.
4237 //
4238 // Furthermore, we note that concatenation of vectors without communication
4239 // is associative, but not commutative. I.e.
4240 // (V1+V2)+V3 = V1 + (V2 + V3), where + is concatenation without
4241 // communication.
4242 //
4243 // So all we need is the vectors listed in the correct order.
4244 //
4245 // We need only Block_to_dof_map_coarse to tell us which external dof types
4246 // are in this block, then Doftype_coarsen_map_fine to tell us which most
4247 // fine grain dofs to concatenate!
4248 //
4249 // All the mapping vectors are constructed to respect the ordering of
4250 // the dof types.
4251
4252 // Get the most fine grain block to dof mapping.
4253 Vector<unsigned> most_fine_grain_dof = Block_to_dof_map_fine[b];
4254
4255 // How many vectors do we need to concatenate?
4256 const unsigned n_dof_vec = most_fine_grain_dof.size();
4257
4258 if (n_dof_vec == 1)
4259 // No need to concatenate, just extract the vector.
4260 {
4261 internal_get_block_vector(most_fine_grain_dof[0], v, w);
4262 }
4263 else
4264 // Need to concatenate dof-level vectors.
4265 {
4266 Vector<DoubleVector> dof_vector(n_dof_vec);
4267
4268 // Get all the dof-level vectors in one go
4269 internal_get_block_vectors(most_fine_grain_dof, v, dof_vector);
4270 // Build w with the correct distribution.
4271 w.build(Block_distribution_pt[b], 0);
4272
4273 // Concatenate the vectors.
4275
4276 dof_vector.clear();
4277 }
4278 } // get_block_vector(...)
4279
4280 //============================================================================
4281 /// Takes the n-th block ordered vector, b, and copies its entries
4282 /// to the appropriate entries in the naturally ordered vector, v.
4283 /// Here n is the block number in the current block preconditioner.
4284 /// If the preconditioner is a subsidiary block preconditioner
4285 /// the other entries in v that are not associated with it
4286 /// are left alone.
4287 ///
4288 /// This version works with the internal block types. This is legacy code
4289 /// but is kept alive, hence moved to private. Please use the
4290 /// function "return_block_vector(...)".
4291 //============================================================================
4292 template<typename MATRIX>
4294 const unsigned& b, const DoubleVector& w, DoubleVector& v) const
4295 {
4296#ifdef PARANOID
4297 // the number of blocks
4298 const unsigned n_blocks = this->internal_nblock_types();
4299
4300 // paranoid check that block i is in this block preconditioner
4301 if (b >= n_blocks)
4302 {
4303 std::ostringstream error_message;
4304 error_message
4305 << "Requested block vector " << b
4306 << ", however this preconditioner has internal_nblock_types() "
4307 << "= " << internal_nblock_types() << std::endl;
4308 throw OomphLibError(
4309 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4310 }
4311 if (!v.built())
4312 {
4313 std::ostringstream error_message;
4314 error_message << "The distribution of the global vector v must be setup.";
4315 throw OomphLibError(
4316 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4317 }
4318 if (*v.distribution_pt() != *this->master_distribution_pt())
4319 {
4320 std::ostringstream error_message;
4321 error_message << "The distribution of the global vector v must match the "
4322 << " specified master_distribution_pt(). \n"
4323 << "i.e. Distribution_pt in the master preconditioner";
4324 throw OomphLibError(
4325 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4326 }
4327 if (!w.built())
4328 {
4329 std::ostringstream error_message;
4330 error_message << "The distribution of the block vector w must be setup.";
4331 throw OomphLibError(
4332 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4333 }
4334 if (*w.distribution_pt() != *Internal_block_distribution_pt[b])
4335 {
4336 std::ostringstream error_message;
4337 error_message
4338 << "The distribution of the block vector w must match the "
4339 << " specified distribution at Internal_block_distribution_pt[b]";
4340 throw OomphLibError(
4341 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4342 }
4343#endif
4344
4345 // if + only one processor
4346 // + more than one processor but matrix_pt is not distributed
4347 // then use the serial get_block method
4348 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4349 !this->distribution_pt()->distributed())
4350 {
4351 // length of vector
4352 unsigned n_row = this->internal_block_dimension(b);
4353
4354 // copy back from the block vector to the naturally ordered vector
4355 double* v_pt = v.values_pt();
4356 const double* w_pt = w.values_pt();
4357 for (unsigned i = 0; i < n_row; i++)
4358 {
4359 v_pt[this->Global_index[b][i]] = w_pt[i];
4360 }
4361 }
4362 // otherwise use mpi
4363 else
4364 {
4365#ifdef OOMPH_HAS_MPI
4366
4367 // my rank
4368 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4369
4370 // the number of processors
4371 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4372
4373 // determine the maximum number of rows to be sent or recv
4374 unsigned max_n_send_or_recv = 0;
4375 for (unsigned p = 0; p < nproc; p++)
4376 {
4377 max_n_send_or_recv =
4378 std::max(max_n_send_or_recv, Nrows_to_send_for_get_block(b, p));
4379 max_n_send_or_recv =
4380 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_block(b, p));
4381 }
4382
4383 // create a vectors of 1s (the size of the nblock for the mpi indexed
4384 // data types
4385 int* block_lengths = new int[max_n_send_or_recv];
4386 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4387 {
4388 block_lengths[i] = 1;
4389 }
4390
4391 // perform the sends and receives
4392 Vector<MPI_Request> requests;
4393 for (unsigned p = 0; p < nproc; p++)
4394 {
4395 // send and recv with other processors
4396 if (p != my_rank)
4397 {
4398 if (Nrows_to_recv_for_get_block(b, p) > 0)
4399 {
4400 // create the send datatype
4401 MPI_Datatype type_send;
4402 MPI_Type_indexed(Nrows_to_recv_for_get_block(b, p),
4403 block_lengths,
4404 Rows_to_recv_for_get_block(b, p),
4405 MPI_DOUBLE,
4406 &type_send);
4407 MPI_Type_commit(&type_send);
4408
4409 // send
4410 MPI_Request send_req;
4411 MPI_Isend(const_cast<double*>(w.values_pt()),
4412 1,
4413 type_send,
4414 p,
4415 0,
4416 this->distribution_pt()->communicator_pt()->mpi_comm(),
4417 &send_req);
4418 MPI_Type_free(&type_send);
4419 requests.push_back(send_req);
4420 }
4421
4422 if (Nrows_to_send_for_get_block(b, p) > 0)
4423 {
4424 // create the recv datatype
4425 MPI_Datatype type_recv;
4426 MPI_Type_indexed(Nrows_to_send_for_get_block(b, p),
4427 block_lengths,
4428 Rows_to_send_for_get_block(b, p),
4429 MPI_DOUBLE,
4430 &type_recv);
4431 MPI_Type_commit(&type_recv);
4432
4433 // recv
4434 MPI_Request recv_req;
4435 MPI_Irecv(v.values_pt(),
4436 1,
4437 type_recv,
4438 p,
4439 0,
4440 this->distribution_pt()->communicator_pt()->mpi_comm(),
4441 &recv_req);
4442 MPI_Type_free(&type_recv);
4443 requests.push_back(recv_req);
4444 }
4445 }
4446
4447 // communicate wih self
4448 else
4449 {
4450 const double* w_values_pt = w.values_pt();
4451 double* v_values_pt = v.values_pt();
4452 for (unsigned i = 0; i < Nrows_to_send_for_get_block(b, p); i++)
4453 {
4454 v_values_pt[Rows_to_send_for_get_block(b, p)[i]] =
4455 w_values_pt[Rows_to_recv_for_get_block(b, p)[i]];
4456 }
4457 }
4458 }
4459
4460 // and then just wait
4461 unsigned c = requests.size();
4462 Vector<MPI_Status> stat(c);
4463 if (c)
4464 {
4465 MPI_Waitall(c, &requests[0], &stat[0]);
4466 }
4467 delete[] block_lengths;
4468
4469#else
4470 // throw error
4471 std::ostringstream error_message;
4472 error_message << "The preconditioner is distributed and on more than one "
4473 << "processor. MPI is required.";
4474 throw OomphLibError(
4475 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4476#endif
4477 }
4478 }
4479
4480 //============================================================================
4481 /// Takes the n-th block ordered vector, b, and copies its entries
4482 /// to the appropriate entries in the naturally ordered vector, v.
4483 /// Here n is the block number in the current block preconditioner.
4484 /// If the preconditioner is a subsidiary block preconditioner
4485 /// the other entries in v that are not associated with it
4486 /// are left alone.
4487 //============================================================================
4488 template<typename MATRIX>
4490 const DoubleVector& b,
4491 DoubleVector& v) const
4492 {
4493#ifdef PARANOID
4494 // the number of blocks
4495 const unsigned para_n_blocks = nblock_types();
4496
4497 // paranoid check that block i is in this block preconditioner
4498 if (n >= para_n_blocks)
4499 {
4500 std::ostringstream err_msg;
4501 err_msg << "Requested block vector " << b
4502 << ", however this preconditioner has " << para_n_blocks
4503 << " block types.\n";
4504 throw OomphLibError(
4505 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4506 }
4507 if (!v.built())
4508 {
4509 std::ostringstream err_msg;
4510 err_msg << "The distribution of the global vector v must be setup.";
4511 throw OomphLibError(
4512 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4513 }
4514 if (*v.distribution_pt() != *this->master_distribution_pt())
4515 {
4516 std::ostringstream err_msg;
4517 err_msg << "The distribution of the global vector v must match the "
4518 << " specified master_distribution_pt(). \n"
4519 << "i.e. Distribution_pt in the master preconditioner";
4520 throw OomphLibError(
4521 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4522 }
4523 if (!b.built())
4524 {
4525 std::ostringstream err_msg;
4526 err_msg << "The distribution of the block vector b must be setup.";
4527 throw OomphLibError(
4528 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4529 }
4530
4531#endif
4532
4533 // Get the most fine grain dof
4534 Vector<unsigned> most_fine_grain_dof = Block_to_dof_map_fine[n];
4535
4536 // How many dofs are in this block?
4537 const unsigned n_dof_vec = Block_to_dof_map_fine[n].size();
4538
4539 if (n_dof_vec == 1)
4540 // There is only one dof, no need to split.
4541 {
4542 internal_return_block_vector(most_fine_grain_dof[0], b, v);
4543 }
4544 else
4545 // Need to split the vector up before we insert them all in one go.
4546 {
4547 Vector<DoubleVector> dof_vector(n_dof_vec);
4548 for (unsigned d = 0; d < n_dof_vec; d++)
4549 {
4550 dof_vector[d].build(
4551 internal_block_distribution_pt(most_fine_grain_dof[d]));
4552 }
4553
4555
4556 // return to v
4557 internal_return_block_vectors(most_fine_grain_dof, dof_vector, v);
4558 }
4559 } // return_block_vector(...)
4560
4561 //============================================================================
4562 /// Given the naturally ordered vector, v, return
4563 /// the vector rearranged in block order in w. This is a legacy function
4564 /// from the old block preconditioning framework. Kept alive in case it may
4565 /// be needed again.
4566 ///
4567 /// This uses the variables ending in "get_ordered". We no longer use this
4568 /// type of method. This function copy values from v and re-order them
4569 /// in "block order" and place them in w. Block order means that the
4570 /// values in w are the same as the concatenated block vectors.
4571 ///
4572 /// I.e. - v is naturally ordered.
4573 /// v -> s_b, v is ordered into blocks vectors
4574 /// (requires communication)
4575 /// concatenate_without_communication(s_{0,...,nblocks},w) gives w.
4576 ///
4577 /// But this function skips out the concatenation part and builds w directly
4578 /// from v.
4579 ///
4580 /// This is nice but the function is implemented in such a way that it
4581 /// always uses all the (internal) blocks, concatenated with the
4582 /// identity ordering. I.e. if this preconditioner has 3 block types, then
4583 /// w will always be:
4584 /// concatenate_without_communication([s_0, s_1, s_2], w). There is no
4585 /// easy way to change this.
4586 ///
4587 /// Furthermore, it does not take into account the new dof type coarsening
4588 /// feature. So this function will most likely produce the incorrect vector
4589 /// w from what the user intended. It still works, but w will be the
4590 /// concatenation of the most fine grain dof block vectors with the
4591 /// "natural" dof type ordering.
4592 ///
4593 /// This has been superseded by the function
4594 /// get_block_ordered_preconditioner_vector(...) which does the correct
4595 /// thing.
4596 //============================================================================
4597 template<typename MATRIX>
4600 DoubleVector& w) const
4601 {
4602#ifdef PARANOID
4603 if (!v.built())
4604 {
4605 std::ostringstream error_message;
4606 error_message << "The distribution of the global vector v must be setup.";
4607 throw OomphLibError(
4608 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4609 }
4610 if (*v.distribution_pt() != *this->master_distribution_pt())
4611 {
4612 std::ostringstream error_message;
4613 error_message << "The distribution of the global vector v must match the "
4614 << " specified master_distribution_pt(). \n"
4615 << "i.e. Distribution_pt in the master preconditioner";
4616 throw OomphLibError(
4617 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4618 }
4619#endif
4620
4621 // Clear and resize w for the reordered vector (initialised to zero)
4622 w.build(this->internal_preconditioner_matrix_distribution_pt(), 0.0);
4623
4624 // if + only one processor
4625 // + more than one processor but matrix_pt is not distributed
4626 // then copy the entries serially (no communication required)
4627 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4628 !this->distribution_pt()->distributed())
4629 {
4630 // number of blocks
4631 unsigned nblock = this->Internal_nblock_types;
4632
4633 // copy v into w block-by-block via the Global_index lookup
4634 unsigned block_offset = 0;
4635 double* w_pt = w.values_pt();
4636 const double* v_pt = v.values_pt();
4637 for (unsigned b = 0; b < nblock; b++)
4638 {
4639 unsigned block_nrow = this->internal_block_dimension(b);
4640 for (unsigned i = 0; i < block_nrow; i++)
4641 {
4642 w_pt[block_offset + i] = v_pt[this->Global_index[b][i]];
4643 }
4644 block_offset += block_nrow;
4645 }
4646 }
4647 // otherwise use mpi
4648 else
4649 {
4650#ifdef OOMPH_HAS_MPI
4651
4652 // my rank
4653 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4654
4655 // the number of processors
4656 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4657
4658 // determine the maximum number of rows to be sent or recv
4659 unsigned max_n_send_or_recv = 0;
4660 for (unsigned p = 0; p < nproc; p++)
4661 {
4662 max_n_send_or_recv =
4663 std::max(max_n_send_or_recv, Nrows_to_send_for_get_ordered[p]);
4664 max_n_send_or_recv =
4665 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_ordered[p]);
4666 }
4667
4668 // create a vector of 1s (one block length per row for the MPI indexed
4669 // datatypes; each indexed entry transfers a single double)
4670 int* block_lengths = new int[max_n_send_or_recv];
4671 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4672 {
4673 block_lengths[i] = 1;
4674 }
4675
4676 // perform the sends and receives
4677 Vector<MPI_Request> requests;
4678 for (unsigned p = 0; p < nproc; p++)
4679 {
4680 // send and recv with other processors
4681 if (p != my_rank)
4682 {
4683 if (Nrows_to_send_for_get_ordered[p] > 0)
4684 {
4685 // create the send datatype (selects this rank's rows of v for p)
4686 MPI_Datatype type_send;
4687 MPI_Type_indexed(Nrows_to_send_for_get_ordered[p],
4688 block_lengths,
4689 Rows_to_send_for_get_ordered[p],
4690 MPI_DOUBLE,
4691 &type_send);
4692 MPI_Type_commit(&type_send);
4693
4694 // non-blocking send (freeing the datatype immediately is safe:
4694 // MPI defers deallocation until pending communication completes)
4695 MPI_Request send_req;
4696 MPI_Isend(const_cast<double*>(v.values_pt()),
4697 1,
4698 type_send,
4699 p,
4700 0,
4701 this->distribution_pt()->communicator_pt()->mpi_comm(),
4702 &send_req);
4703 MPI_Type_free(&type_send);
4704 requests.push_back(send_req);
4705 }
4706
4707 if (Nrows_to_recv_for_get_ordered[p] > 0)
4708 {
4709 // create the recv datatype (scatters straight into w's rows)
4710 MPI_Datatype type_recv;
4711 MPI_Type_indexed(Nrows_to_recv_for_get_ordered[p],
4712 block_lengths,
4713 Rows_to_recv_for_get_ordered[p],
4714 MPI_DOUBLE,
4715 &type_recv);
4716 MPI_Type_commit(&type_recv);
4717
4718 // non-blocking receive directly into w
4719 MPI_Request recv_req;
4720 MPI_Irecv(w.values_pt(),
4721 1,
4722 type_recv,
4723 p,
4724 0,
4725 this->distribution_pt()->communicator_pt()->mpi_comm(),
4726 &recv_req);
4727 MPI_Type_free(&type_recv);
4728 requests.push_back(recv_req);
4729 }
4730 }
4731
4732 // communicate with self (direct local copy, no MPI needed)
4733 else
4734 {
4735 double* w_values_pt = w.values_pt();
4736 const double* v_values_pt = v.values_pt();
4737 for (unsigned i = 0; i < Nrows_to_send_for_get_ordered[p]; i++)
4738 {
4739 w_values_pt[Rows_to_recv_for_get_ordered[p][i]] =
4740 v_values_pt[Rows_to_send_for_get_ordered[p][i]];
4741 }
4742 }
4743 }
4744
4745 // wait for all outstanding non-blocking sends/recvs to complete
4746 unsigned c = requests.size();
4747 Vector<MPI_Status> stat(c);
4748 if (c)
4749 {
4750 MPI_Waitall(c, &requests[0], &stat[0]);
4751 }
4752 delete[] block_lengths;
4753
4754#else
4755 // throw error
4756 std::ostringstream error_message;
4757 error_message << "The preconditioner is distributed and on more than one "
4758 << "processor. MPI is required.";
4759 throw OomphLibError(
4760 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4761#endif
4762 }
4763 }
4764
4765 //============================================================================
4766 /// Given the naturally ordered vector, v, return
4767 /// the vector rearranged in block order in w. This function calls
4768 /// get_concatenated_block_vector(...) with the identity block mapping.
4769 ///
4770 /// This function has been re-written to work with the new dof type
4771 /// coarsening feature. The old function is kept alive in
4772 /// internal_get_block_ordered_preconditioner_vector(...) and is moved to
4773 /// the private section of the code. The differences between the two are:
4774 ///
4775 /// 1) This function extracts all the block vectors (in one go) via the
4776 /// function internal_get_block_vectors(...), and concatenates them.
4777 ///
4778 /// 2) The old function makes use of the variables ending in "get_ordered",
4779 /// thus is slightly more efficient since it does not have to concatenate
4780 /// any block vectors.
4781 ///
4782 /// 3) The old function no longer respects the new indirections if dof types
4783 /// have been coarsened.
4784 ///
4785 /// 4) This function extracts the most fine grain dof-level vectors and
4786 /// concatenates them. These dof-level vectors respect the re-ordering
4787 /// caused by the coarsening of dof types. The overhead associated with
4788 /// concatenating DoubleVectors without communication is very small.
4789 ///
4790 /// This function should be used.
4791 //============================================================================
4792 template<typename MATRIX>
4794 const DoubleVector& v, DoubleVector& w)
4795 {
4796#ifdef PARANOID
4797 if (!v.built())
4798 {
4799 std::ostringstream error_message;
4800 error_message << "The distribution of the global vector v must be setup.";
4801 throw OomphLibError(
4802 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4803 }
4804 if (*v.distribution_pt() != *this->master_distribution_pt())
4805 {
4806 std::ostringstream error_message;
4807 error_message << "The distribution of the global vector v must match the "
4808 << " specified master_distribution_pt(). \n"
4809 << "i.e. Distribution_pt in the master preconditioner";
4810 throw OomphLibError(
4811 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4812 }
4813#endif
4814
4815 // Get the number of blocks.
4816 unsigned nblocks = this->nblock_types();
4817
4818 // Fill in the identity block mapping: block b of w is block b of this
4818 // preconditioner, so w is the concatenation [s_0, s_1, ..., s_{n-1}].
4819 Vector<unsigned> block_vec_number(nblocks, 0);
4820 for (unsigned b = 0; b < nblocks; b++)
4821 {
4822 block_vec_number[b] = b;
4823 }
4824
4825 // Extract all the block vectors and concatenate them into w in one go.
4826 get_concatenated_block_vector(block_vec_number, v, w);
4827 } // get_block_ordered_preconditioner_vector(...)
4828
4829 //============================================================================
4830 /// Takes the block ordered vector, w, and reorders it in the natural
4831 /// order. Reordered vector is returned in v. Note: If the preconditioner is
4832 /// a subsidiary preconditioner then only the components of the vector
4833 /// associated with the blocks of the subsidiary preconditioner will be
4834 /// included. Hence the length of v is master_nrow() whereas that of the
4835 /// vector w is of length this->nrow().
4836 ///
4837 /// This is the return function for the function
4838 /// internal_get_block_ordered_preconditioner_vector(...).
4839 /// Both internal_get_block_ordered_preconditioner_vector(...) and
4840 /// internal_return_block_ordered_preconditioner_vector(...) have been
4841 /// superseded by the functions
4842 ///
4843 /// get_block_ordered_preconditioner_vector(...) and
4844 /// return_block_ordered_preconditioner_vector(...),
4845 ///
4846 /// Thus this function is moved to the private section of the code.
4847 //============================================================================
4848 template<typename MATRIX>
4851 DoubleVector& v) const
4852 {
4853#ifdef PARANOID
4854 if (!v.built())
4855 {
4856 std::ostringstream error_message;
4857 error_message << "The distribution of the global vector v must be setup.";
4858 throw OomphLibError(
4859 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4860 }
4861 if (*v.distribution_pt() != *this->master_distribution_pt())
4862 {
4863 std::ostringstream error_message;
4864 error_message << "The distribution of the global vector v must match the "
4865 << " specified master_distribution_pt(). \n"
4866 << "i.e. Distribution_pt in the master preconditioner";
4867 throw OomphLibError(
4868 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4869 }
4870 if (!w.built())
4871 {
4872 std::ostringstream error_message;
4873 error_message << "The distribution of the block vector w must be setup.";
4874 throw OomphLibError(
4875 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4876 }
4877 if (*w.distribution_pt() !=
4878 *this->internal_preconditioner_matrix_distribution_pt())
4879 {
4880 std::ostringstream error_message;
4881 error_message << "The distribution of the block vector w must match the "
4882 << " specified distribution at Distribution_pt[b]";
4883 throw OomphLibError(
4884 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
4885 }
4886#endif
4887
4888
4889 // if + only one processor
4890 // + more than one processor but matrix_pt is not distributed
4891 // then copy the entries serially (no communication required)
4892 if (this->distribution_pt()->communicator_pt()->nproc() == 1 ||
4893 !this->distribution_pt()->distributed())
4894 {
4895 // number of blocks
4896 unsigned nblock = this->Internal_nblock_types;
4897
4898 // copy w back into v block-by-block via the Global_index lookup
4898 // (the inverse of the copy in
4898 // internal_get_block_ordered_preconditioner_vector(...))
4899 unsigned block_offset = 0;
4900 const double* w_pt = w.values_pt();
4901 double* v_pt = v.values_pt();
4902 for (unsigned b = 0; b < nblock; b++)
4903 {
4904 unsigned block_nrow = this->internal_block_dimension(b);
4905 for (unsigned i = 0; i < block_nrow; i++)
4906 {
4907 v_pt[this->Global_index[b][i]] = w_pt[block_offset + i];
4908 }
4909 block_offset += block_nrow;
4910 }
4911 }
4912 // otherwise use mpi
4913 else
4914 {
4915#ifdef OOMPH_HAS_MPI
4916
4917 // my rank
4918 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
4919
4920 // the number of processors
4921 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
4922
4923 // determine the maximum number of rows to be sent or recv
4924 unsigned max_n_send_or_recv = 0;
4925 for (unsigned p = 0; p < nproc; p++)
4926 {
4927 max_n_send_or_recv =
4928 std::max(max_n_send_or_recv, Nrows_to_send_for_get_ordered[p]);
4929 max_n_send_or_recv =
4930 std::max(max_n_send_or_recv, Nrows_to_recv_for_get_ordered[p]);
4931 }
4932
4933 // create a vector of 1s (one block length per row for the MPI indexed
4934 // datatypes; each indexed entry transfers a single double)
4935 int* block_lengths = new int[max_n_send_or_recv];
4936 for (unsigned i = 0; i < max_n_send_or_recv; i++)
4937 {
4938 block_lengths[i] = 1;
4939 }
4940
4941 // perform the sends and receives
4941 // NOTE: this is the reverse of the "get" communication, so the roles
4941 // of the get-phase lookup tables are swapped: rows that were received
4941 // during "get" are now sent back, and vice versa.
4942 Vector<MPI_Request> requests;
4943 for (unsigned p = 0; p < nproc; p++)
4944 {
4945 // send and recv with other processors
4946 if (p != my_rank)
4947 {
4948 if (Nrows_to_recv_for_get_ordered[p] > 0)
4949 {
4950 // create the send datatype (the get-phase "recv" rows of w)
4951 MPI_Datatype type_send;
4952 MPI_Type_indexed(Nrows_to_recv_for_get_ordered[p],
4953 block_lengths,
4954 Rows_to_recv_for_get_ordered[p],
4955 MPI_DOUBLE,
4956 &type_send);
4957 MPI_Type_commit(&type_send);
4958
4959 // non-blocking send (freeing the datatype immediately is safe:
4959 // MPI defers deallocation until pending communication completes)
4960 MPI_Request send_req;
4961 MPI_Isend(const_cast<double*>(w.values_pt()),
4962 1,
4963 type_send,
4964 p,
4965 0,
4966 this->distribution_pt()->communicator_pt()->mpi_comm(),
4967 &send_req);
4968 MPI_Type_free(&type_send);
4969 requests.push_back(send_req);
4970 }
4971
4972 if (Nrows_to_send_for_get_ordered[p] > 0)
4973 {
4974 // create the recv datatype (the get-phase "send" rows of v)
4975 MPI_Datatype type_recv;
4976 MPI_Type_indexed(Nrows_to_send_for_get_ordered[p],
4977 block_lengths,
4978 Rows_to_send_for_get_ordered[p],
4979 MPI_DOUBLE,
4980 &type_recv);
4981 MPI_Type_commit(&type_recv);
4982
4983 // non-blocking receive directly into v
4984 MPI_Request recv_req;
4985 MPI_Irecv(v.values_pt(),
4986 1,
4987 type_recv,
4988 p,
4989 0,
4990 this->distribution_pt()->communicator_pt()->mpi_comm(),
4991 &recv_req);
4992 MPI_Type_free(&type_recv);
4993 requests.push_back(recv_req);
4994 }
4995 }
4996
4997 // communicate with self (direct local copy, no MPI needed)
4998 else
4999 {
5000 const double* w_values_pt = w.values_pt();
5001 double* v_values_pt = v.values_pt();
5002 for (unsigned i = 0; i < Nrows_to_send_for_get_ordered[p]; i++)
5003 {
5004 v_values_pt[Rows_to_send_for_get_ordered[p][i]] =
5005 w_values_pt[Rows_to_recv_for_get_ordered[p][i]];
5006 }
5007 }
5008 }
5009
5010 // wait for all outstanding non-blocking sends/recvs to complete
5011 unsigned c = requests.size();
5012 Vector<MPI_Status> stat(c);
5013 if (c)
5014 {
5015 MPI_Waitall(c, &requests[0], &stat[0]);
5016 }
5017 delete[] block_lengths;
5018
5019#else
5020 // throw error
5021 std::ostringstream error_message;
5022 error_message << "The preconditioner is distributed and on more than one "
5023 << "processor. MPI is required.";
5024 throw OomphLibError(
5025 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5026#endif
5027 } // else use mpi
5028 } // function return_block_ordered_preconditioner_vector
5029
5030
5031 //============================================================================
5032 /// Takes the block ordered vector, w, and reorders it in natural
5033 /// order. Reordered vector is returned in v. Note: If the preconditioner is
5034 /// a subsidiary preconditioner then only the components of the vector
5035 /// associated with the blocks of the subsidiary preconditioner will be
5036 /// included. Hence the length of v is master_nrow() whereas that of the
5037 /// vector w is of length this->nrow().
5038 ///
5039 /// This is the return function for the function
5040 /// get_block_ordered_preconditioner_vector(...).
5041 ///
5042 /// It calls the function return_concatenated_block_vector(...) with the
5043 /// identity block number ordering.
5044 //============================================================================
5045 template<typename MATRIX>
5047 const DoubleVector& w, DoubleVector& v) const
5048 {
5049#ifdef PARANOID
5050 if (!v.built())
5051 {
5052 std::ostringstream error_message;
5053 error_message << "The distribution of the global vector v must be setup.";
5054 throw OomphLibError(
5055 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5056 }
5057 if (*v.distribution_pt() != *this->master_distribution_pt())
5058 {
5059 std::ostringstream error_message;
5060 error_message << "The distribution of the global vector v must match the "
5061 << " specified master_distribution_pt(). \n"
5062 << "i.e. Distribution_pt in the master preconditioner";
5063 throw OomphLibError(
5064 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5065 }
5066 if (!w.built())
5067 {
5068 std::ostringstream error_message;
5069 error_message << "The distribution of the block vector w must be setup.";
5070 throw OomphLibError(
5071 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5072 }
5073 if (*w.distribution_pt() != *this->preconditioner_matrix_distribution_pt())
5074 {
5075 std::ostringstream error_message;
5076 error_message << "The distribution of the block vector w must match the "
5077 << "concatenations of distributions in "
5078 << "Block_distribution_pt.\n";
5079 throw OomphLibError(
5080 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5081 }
5082#endif
5083
5084 // Build the identity block mapping (block b of w is block b of the
5084 // preconditioner), then split w back into the global vector v.
5085 const unsigned nblocks = nblock_types();
5086 Vector<unsigned> block_vec_number(nblocks, 0);
5087 for (unsigned b = 0; b < nblocks; b++)
5088 {
5089 block_vec_number[b] = b;
5090 }
5091
5092 return_concatenated_block_vector(block_vec_number, w, v);
5093 } // function return_block_ordered_preconditioner_vector
5094
5095 //=============================================================================
5096 /// Gets block (i,j) from the matrix pointed to by
5097 /// Matrix_pt and returns it in output_block. This is associated with the
5098 /// internal blocks. Please use the other get_block(...) function.
5099 //=============================================================================
5100 template<>
5102 const unsigned& block_i,
5103 const unsigned& block_j,
5104 CRDoubleMatrix& output_block) const
5105 {
5106#ifdef PARANOID
5107 // the number of blocks
5108 const unsigned n_blocks = this->internal_nblock_types();
5109
5110 // paranoid check that block i is in this block preconditioner
5111 if (block_i >= n_blocks || block_j >= n_blocks)
5112 {
5113 std::ostringstream error_message;
5114 error_message
5115 << "Requested block (" << block_i << "," << block_j
5116 << "), however this preconditioner has internal_nblock_types() "
5117 << "= " << internal_nblock_types() << std::endl;
5118 throw OomphLibError(
5119 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5120 }
5121
5122 // Check that the matrix is the same as that of the master
5123 if (is_subsidiary_block_preconditioner())
5124 {
5125 if (master_block_preconditioner_pt()->matrix_pt() != matrix_pt())
5126 {
5127 std::string err = "Master and subs should have same matrix.";
5128 throw OomphLibError(
5129 err, OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5130 }
5131 }
5132#endif
5133
5134 // Cast the pointer
5135 CRDoubleMatrix* cr_matrix_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt());
5136
5137 // if + only one processor
5138 // + more than one processor but matrix_pt is not distributed
5139 // then use the serial get_block method
5140 if (cr_matrix_pt->distribution_pt()->communicator_pt()->nproc() == 1 ||
5141 !cr_matrix_pt->distribution_pt()->distributed())
5142 {
5143 // pointers for the jacobian matrix is compressed row sparse format
5144 int* j_row_start;
5145 int* j_column_index;
5146 double* j_value;
5147
5148 // sets pointers to jacobian matrix
5149 j_row_start = cr_matrix_pt->row_start();
5150 j_column_index = cr_matrix_pt->column_index();
5151 j_value = cr_matrix_pt->value();
5152
5153 // get the block dimensions
5154 unsigned block_nrow = this->internal_block_dimension(block_i);
5155 unsigned block_ncol = this->internal_block_dimension(block_j);
5156
5157 // allocate temporary storage for the component vectors of block (i,j)
5158 // temp_ptr is used to point to an element in each column - required as
5159 // cannot assume that order of block's rows in jacobian and the block
5160 // matrix will be the same
5161 int* temp_row_start = new int[block_nrow + 1];
5162 for (unsigned i = 0; i <= block_nrow; i++)
5163 {
5164 temp_row_start[i] = 0;
5165 }
5166 Vector<int> temp_ptr(block_nrow + 1);
5167 int block_nnz = 0;
5168
5169 // get number of rows in source matrix
5170 unsigned master_nrow = this->master_nrow();
5171
5172 // determine how many non zeros there are in the block (i,j)
5173 // also determines how many non zeros are stored in each row or column -
5174 // stored in temp_ptr temporarily
5175 for (unsigned k = 0; k < master_nrow; k++)
5176 {
5177 if (internal_block_number(k) == static_cast<int>(block_i))
5178 {
5179 for (int l = j_row_start[k]; l < j_row_start[k + 1]; l++)
5180 {
5181 if (internal_block_number(j_column_index[l]) ==
5182 static_cast<int>(block_j))
5183 {
5184 block_nnz++;
5185 temp_ptr[internal_index_in_block(k) + 1]++;
5186 }
5187 }
5188 }
5189 }
5190
5191 // if the matrix is not empty
5192 int* temp_column_index = new int[block_nnz];
5193 double* temp_value = new double[block_nnz];
5194 if (block_nnz > 0)
5195 {
5196 // uses number of elements in each column of block to determine values
5197 // for the block column start (temp_row_start)
5198 temp_row_start[0] = 0;
5199 for (unsigned k = 1; k <= block_nrow; k++)
5200 {
5201 temp_row_start[k] = temp_row_start[k - 1] + temp_ptr[k];
5202 temp_ptr[k] = temp_row_start[k];
5203 }
5204
5205 // copies the relevant elements of the jacobian to the correct entries
5206 // of the block matrix
5207 for (unsigned k = 0; k < master_nrow; k++)
5208 {
5209 if (internal_block_number(k) == static_cast<int>(block_i))
5210 {
5211 for (int l = j_row_start[k]; l < j_row_start[k + 1]; l++)
5212 {
5213 if (internal_block_number(j_column_index[l]) ==
5214 static_cast<int>(block_j))
5215 {
5216 int kk = temp_ptr[internal_index_in_block(k)]++;
5217 temp_value[kk] = j_value[l];
5218 temp_column_index[kk] =
5219 internal_index_in_block(j_column_index[l]);
5220 }
5221 }
5222 }
5223 }
5224 }
5225
5226
5227 // Fill in the compressed row matrix ??ds Note: I kept the calls to
5228 // build as close as I could to before (had to replace new(dist) with
5229 // .build(dist) ).
5230 output_block.build(Internal_block_distribution_pt[block_i]);
5231 output_block.build_without_copy(
5232 block_ncol, block_nnz, temp_value, temp_column_index, temp_row_start);
5233
5234#ifdef PARANOID
5235 // checks to see if block matrix has been set up correctly
5236 // block_matrix_test(matrix_pt,block_i,block_j,block_pt);
5237 if (Run_block_matrix_test)
5238 {
5239 // checks to see if block matrix has been set up correctly
5240 block_matrix_test(block_i, block_j, &output_block);
5241 }
5242#endif
5243 }
5244
5245
5246 // otherwise we are dealing with a distributed matrix
5247 else
5248 {
5249#ifdef OOMPH_HAS_MPI
5250 // number of processors
5251 unsigned nproc = this->distribution_pt()->communicator_pt()->nproc();
5252
5253 // my rank
5254 unsigned my_rank = this->distribution_pt()->communicator_pt()->my_rank();
5255
5256 // sets pointers to jacobian matrix
5257 int* j_row_start = cr_matrix_pt->row_start();
5258 int* j_column_index = cr_matrix_pt->column_index();
5259 double* j_value = cr_matrix_pt->value();
5260
5261 // number of non zeros in each row to be sent
5262 Vector<int*> nnz_send(nproc, 0);
5263
5264 // number of non zeros in each row to be received
5265 Vector<int*> nnz_recv(nproc, 0);
5266
5267 // storage for data to be sent
5268 Vector<int*> column_index_for_proc(nproc, 0);
5269 Vector<double*> values_for_proc(nproc, 0);
5270
5271 // number of non zeros to be sent to each processor
5272 Vector<unsigned> total_nnz_send(nproc, 0);
5273
5274 // number of rows of the block matrix on this processor
5275 unsigned nrow_local =
5276 Internal_block_distribution_pt[block_i]->nrow_local();
5277
5278 // resize the nnz storage and compute nnz_send
5279 // and send and recv the nnz
5280 Vector<MPI_Request> send_req;
5281 Vector<MPI_Request> recv1_req;
5282 for (unsigned p = 0; p < nproc; p++)
5283 {
5284 int nrow_send = Nrows_to_send_for_get_block(block_i, p);
5285 int nrow_recv = Nrows_to_recv_for_get_block(block_i, p);
5286
5287 // assemble nnz recv
5288 nnz_recv[p] = new int[nrow_recv];
5289
5290 // assemble the storage to send
5291 if (nrow_send > 0 && p != my_rank)
5292 {
5293 nnz_send[p] = new int[nrow_send];
5294 }
5295
5296 // compute the number of nnzs in each row and the total number
5297 // of nnzs
5298 for (int i = 0; i < nrow_send; i++)
5299 {
5300 unsigned row = Rows_to_send_for_get_block(block_i, p)[i];
5301 int c = 0;
5302 for (int r = j_row_start[row]; r < j_row_start[row + 1]; r++)
5303 {
5304 if (internal_block_number(j_column_index[r]) == int(block_j))
5305 {
5306 c++;
5307 }
5308 }
5309 if (p != my_rank)
5310 {
5311 nnz_send[p][i] = c;
5312 }
5313 else
5314 {
5315 nnz_recv[p][i] = c;
5316 }
5317 total_nnz_send[p] += c;
5318 }
5319
5320 // send
5321 if (p != my_rank)
5322 {
5323 if (nrow_send)
5324 {
5325 MPI_Request req;
5326 MPI_Isend(nnz_send[p],
5327 nrow_send,
5328 MPI_INT,
5329 p,
5330 0,
5331 this->distribution_pt()->communicator_pt()->mpi_comm(),
5332 &req);
5333 send_req.push_back(req);
5334 }
5335
5336 // recv
5337 if (nrow_recv)
5338 {
5339 MPI_Request req;
5340 MPI_Irecv(nnz_recv[p],
5341 nrow_recv,
5342 MPI_INT,
5343 p,
5344 0,
5345 this->distribution_pt()->communicator_pt()->mpi_comm(),
5346 &req);
5347 recv1_req.push_back(req);
5348 }
5349 }
5350 }
5351
5352 // next assemble the values and row_start data to be sent for each
5353 // processor
5354 for (unsigned p = 0; p < nproc; p++)
5355 {
5356 int nrow_send = Nrows_to_send_for_get_block(block_i, p);
5357
5358 // assemble the storage for the values and column indices to be sent
5359 if (p != my_rank)
5360 {
5361 if (total_nnz_send[p] > 0)
5362 {
5363 values_for_proc[p] = new double[total_nnz_send[p]];
5364 column_index_for_proc[p] = new int[total_nnz_send[p]];
5365
5366 // copy the values and column indices to the storage
5367 unsigned ptr = 0;
5368 for (int i = 0; i < nrow_send; i++)
5369 {
5370 unsigned row = Rows_to_send_for_get_block(block_i, p)[i];
5371 for (int r = j_row_start[row]; r < j_row_start[row + 1]; r++)
5372 {
5373 if (internal_block_number(j_column_index[r]) == int(block_j))
5374 {
5375 values_for_proc[p][ptr] = j_value[r];
5376 column_index_for_proc[p][ptr] =
5377 internal_index_in_block(j_column_index[r]);
5378 ptr++;
5379 }
5380 }
5381 }
5382
5383 // create the datatypes
5384 MPI_Datatype types[2];
5385 MPI_Type_contiguous(total_nnz_send[p], MPI_DOUBLE, &types[0]);
5386 MPI_Type_commit(&types[0]);
5387 MPI_Type_contiguous(total_nnz_send[p], MPI_INT, &types[1]);
5388 MPI_Type_commit(&types[1]);
5389
5390 // get the start address of the vectors
5391 MPI_Aint displacement[2];
5392 MPI_Get_address(values_for_proc[p], &displacement[0]);
5393 MPI_Get_address(column_index_for_proc[p], &displacement[1]);
5394
5395 // compute the displacements
5396 displacement[1] -= displacement[0];
5397 displacement[0] -= displacement[0];
5398
5399 // compute the block lengths
5400 int length[2];
5401 length[0] = length[1] = 1;
5402
5403 // build the struct data type
5404 MPI_Datatype final_type;
5405 MPI_Type_create_struct(2, length, displacement, types, &final_type);
5406 MPI_Type_commit(&final_type);
5407 MPI_Type_free(&types[0]);
5408 MPI_Type_free(&types[1]);
5409
5410 // and send
5411 MPI_Request req;
5412 MPI_Isend(values_for_proc[p],
5413 1,
5414 final_type,
5415 p,
5416 1,
5417 this->distribution_pt()->communicator_pt()->mpi_comm(),
5418 &req);
5419 send_req.push_back(req);
5420 MPI_Type_free(&final_type);
5421 }
5422 }
5423 }
5424
5425 // wait for the recv to complete (the row_start recv which actually
5426 // contains the number of nnzs in each row)
5427 int c_recv = recv1_req.size();
5428 if (c_recv != 0)
5429 {
5430 MPI_Waitall(c_recv, &recv1_req[0], MPI_STATUS_IGNORE);
5431 }
5432
5433 // compute the total number of nnzs to be received
5434 Vector<int> total_nnz_recv_from_proc(nproc);
5435 int local_block_nnz = 0;
5436 for (unsigned p = 0; p < nproc; p++)
5437 {
5438 // compute the total nnzs
5439 for (unsigned i = 0; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5440 {
5441 total_nnz_recv_from_proc[p] += nnz_recv[p][i];
5442 }
5443 local_block_nnz += total_nnz_recv_from_proc[p];
5444 }
5445
5446 // compute the offset for each block of nnzs (a matrix row) in the
5447 // values_recv and column_index_recv vectors
5448
5449 // fisrt determine how many blocks of rows are to be recv
5450 Vector<int> n_recv_block(nproc, 0);
5451 for (unsigned p = 0; p < nproc; p++)
5452 {
5453 if (Nrows_to_recv_for_get_block(block_i, p) > 0)
5454 {
5455 n_recv_block[p] = 1;
5456 }
5457 for (unsigned i = 1; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5458 {
5459 if (Rows_to_recv_for_get_block(block_i, p)[i] !=
5460 Rows_to_recv_for_get_block(block_i, p)[i - 1] + 1)
5461 {
5462 n_recv_block[p]++;
5463 }
5464 }
5465 }
5466
5467 // next assemble row start recv
5468 int* row_start_recv = new int[nrow_local + 1];
5469 for (unsigned i = 0; i <= nrow_local; i++)
5470 {
5471 row_start_recv[i] = 0;
5472 }
5473 for (unsigned p = 0; p < nproc; p++)
5474 {
5475 for (unsigned i = 0; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5476 {
5477 row_start_recv[Rows_to_recv_for_get_block(block_i, p)[i]] =
5478 nnz_recv[p][i];
5479 }
5480 }
5481 int g = row_start_recv[0];
5482 row_start_recv[0] = 0;
5483 for (unsigned i = 1; i < nrow_local; i++)
5484 {
5485 int temp_g = g;
5486 g = row_start_recv[i];
5487 row_start_recv[i] = row_start_recv[i - 1] + temp_g;
5488 }
5489 row_start_recv[nrow_local] = row_start_recv[nrow_local - 1] + g;
5490
5491 // next assemble the offset and the number of nzs in each recv block
5492 Vector<int*> offset_recv_block(nproc, 0);
5493 Vector<int*> nnz_recv_block(nproc, 0);
5494 for (unsigned p = 0; p < nproc; p++)
5495 {
5496 if (Nrows_to_recv_for_get_block(block_i, p) > 0)
5497 {
5498 offset_recv_block[p] = new int[n_recv_block[p]];
5499 offset_recv_block[p][0] = 0;
5500 nnz_recv_block[p] = new int[n_recv_block[p]];
5501 for (int i = 0; i < n_recv_block[p]; i++)
5502 {
5503 nnz_recv_block[p][i] = 0;
5504 }
5505 unsigned ptr = 0;
5506 nnz_recv_block[p][ptr] += nnz_recv[p][0];
5507 offset_recv_block[p][0] =
5508 row_start_recv[Rows_to_recv_for_get_block(block_i, p)[0]];
5509 for (unsigned i = 1; i < Nrows_to_recv_for_get_block(block_i, p); i++)
5510 {
5511 if (Rows_to_recv_for_get_block(block_i, p)[i] !=
5512 Rows_to_recv_for_get_block(block_i, p)[i - 1] + 1)
5513 {
5514 ptr++;
5515 offset_recv_block[p][ptr] =
5516 row_start_recv[Rows_to_recv_for_get_block(block_i, p)[i]];
5517 }
5518 nnz_recv_block[p][ptr] += nnz_recv[p][i];
5519 }
5520 }
5521 delete[] nnz_recv[p];
5522 }
5523
5524 // post the receives
5525 int* column_index_recv = new int[local_block_nnz];
5526 double* values_recv = new double[local_block_nnz];
5527 Vector<MPI_Request> recv2_req;
5528 for (unsigned p = 0; p < nproc; p++)
5529 {
5530 if (p != my_rank)
5531 {
5532 if (total_nnz_recv_from_proc[p] != 0)
5533 {
5534 // create the datatypes
5535 MPI_Datatype types[2];
5536 MPI_Type_indexed(n_recv_block[p],
5537 nnz_recv_block[p],
5538 offset_recv_block[p],
5539 MPI_DOUBLE,
5540 &types[0]);
5541 MPI_Type_commit(&types[0]);
5542 MPI_Type_indexed(n_recv_block[p],
5543 nnz_recv_block[p],
5544 offset_recv_block[p],
5545 MPI_INT,
5546 &types[1]);
5547 MPI_Type_commit(&types[1]);
5548
5549 // compute the displacements
5550 MPI_Aint displacements[2];
5551 MPI_Get_address(values_recv, &displacements[0]);
5552 MPI_Get_address(column_index_recv, &displacements[1]);
5553 displacements[1] -= displacements[0];
5554 displacements[0] -= displacements[0];
5555
5556 // compute the block lengths
5557 int length[2];
5558 length[0] = length[1] = 1;
5559
5560 // create the final datatype
5561 MPI_Datatype final_type;
5562 MPI_Type_create_struct(
5563 2, length, displacements, types, &final_type);
5564 MPI_Type_commit(&final_type);
5565 MPI_Type_free(&types[0]);
5566 MPI_Type_free(&types[1]);
5567
5568 // and the recv
5569 MPI_Request req;
5570 MPI_Irecv(values_recv,
5571 1,
5572 final_type,
5573 p,
5574 1,
5575 this->distribution_pt()->communicator_pt()->mpi_comm(),
5576 &req);
5577 recv2_req.push_back(req);
5578 MPI_Type_free(&final_type);
5579 }
5580 }
5581 else
5582 {
5583 // next send the values and column indices to self
5584 unsigned block_ptr = 0;
5585 unsigned counter = 0;
5586 int nrow_send = Nrows_to_send_for_get_block(block_i, my_rank);
5587 if (nrow_send > 0)
5588 {
5589 unsigned offset = offset_recv_block[my_rank][0];
5590 for (int i = 0; i < nrow_send; i++)
5591 {
5592 if (i > 0)
5593 {
5594 if (Rows_to_recv_for_get_block(block_i, p)[i] !=
5595 Rows_to_recv_for_get_block(block_i, p)[i - 1] + 1)
5596 {
5597 counter = 0;
5598 block_ptr++;
5599 offset = offset_recv_block[my_rank][block_ptr];
5600 }
5601 }
5602 unsigned row = Rows_to_send_for_get_block(block_i, my_rank)[i];
5603 for (int r = j_row_start[row]; r < j_row_start[row + 1]; r++)
5604 {
5605 if (internal_block_number(j_column_index[r]) == int(block_j))
5606 {
5607 values_recv[offset + counter] = j_value[r];
5608 column_index_recv[offset + counter] =
5609 internal_index_in_block(j_column_index[r]);
5610 counter++;
5611 }
5612 }
5613 }
5614 }
5615 }
5616 }
5617
5618 // wait for the recv to complete (for the column_index and the values_
5619 c_recv = recv2_req.size();
5620 if (c_recv != 0)
5621 {
5622 MPI_Waitall(c_recv, &recv2_req[0], MPI_STATUS_IGNORE);
5623 }
5624
5625 // Fill in the compressed row matrix
5626 output_block.build(Internal_block_distribution_pt[block_i]);
5627 output_block.build_without_copy(this->internal_block_dimension(block_j),
5628 local_block_nnz,
5629 values_recv,
5630 column_index_recv,
5631 row_start_recv);
5632
5633 // wait for the send to complete (nnz / row_start)
5634 int c_send = send_req.size();
5635 if (c_send)
5636 {
5637 MPI_Waitall(c_send, &send_req[0], MPI_STATUS_IGNORE);
5638 }
5639
5640 // delete temp storage used for assembling data for communication
5641 for (unsigned p = 0; p < nproc; p++)
5642 {
5643 delete[] nnz_send[p];
5644 delete[] column_index_for_proc[p];
5645 delete[] values_for_proc[p];
5646 delete[] offset_recv_block[p];
5647 delete[] nnz_recv_block[p];
5648 }
5649#else
5650 // throw error
5651 std::ostringstream error_message;
5652 error_message << "The matrix is distributed and on more than one "
5653 << "processor. MPI is required.";
5654 throw OomphLibError(
5655 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5656#endif
5657 }
5658 }
5659
5660 //=============================================================================
5661 /// Gets dof-level block (i,j).
5662 /// If Replacement_dof_block_pt(i,j) is not null, then the replacement
5663 /// block is returned via a deep copy.
5664 ///
5665 /// Otherwise if this is the uppermost block preconditioner then it calls
5666 /// internal_get_block(i,j), else if it is a subsidiary
5667 /// block preconditioner, it will call it's master block preconditioners'
5668 /// get_dof_level_block function.
5669 //=============================================================================
5670 template<>
5672 const unsigned& block_i,
5673 const unsigned& block_j,
5674 CRDoubleMatrix& output_block,
5675 const bool& ignore_replacement_block) const
5676 {
5677#ifdef PARANOID
5678 // the number of dof types.
5679 unsigned para_ndofs = ndof_types();
5680
5681 // paranoid check that block i is in this block preconditioner
5682 if (block_i >= para_ndofs || block_j >= para_ndofs)
5683 {
5684 std::ostringstream err_msg;
5685 err_msg << "Requested dof block (" << block_i << "," << block_j
5686 << "), however this preconditioner has ndof_types() "
5687 << "= " << para_ndofs << std::endl;
5688 throw OomphLibError(
5689 err_msg.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5690 }
5691#endif
5692
5693 CRDoubleMatrix* tmp_block_pt =
5694 Replacement_dof_block_pt.get(block_i, block_j);
5695
5696 if ((tmp_block_pt == 0) || ignore_replacement_block)
5697 {
5698 // Getting the block from parent preconditioner
5699 const unsigned ndof_in_parent_i =
5700 Doftype_coarsen_map_coarse[block_i].size();
5701 const unsigned ndof_in_parent_j =
5702 Doftype_coarsen_map_coarse[block_j].size();
5703
5704 if (ndof_in_parent_i == 1 && ndof_in_parent_j == 1)
5705 {
5706 unsigned parent_dof_i = Doftype_coarsen_map_coarse[block_i][0];
5707 unsigned parent_dof_j = Doftype_coarsen_map_coarse[block_j][0];
5708
5709 if (is_master_block_preconditioner())
5710 {
5711 internal_get_block(parent_dof_i, parent_dof_j, output_block);
5712 }
5713 else
5714 {
5715 parent_dof_i = Doftype_in_master_preconditioner_coarse[parent_dof_i];
5716 parent_dof_j = Doftype_in_master_preconditioner_coarse[parent_dof_j];
5717
5718 master_block_preconditioner_pt()->get_dof_level_block(
5719 parent_dof_i, parent_dof_j, output_block, ignore_replacement_block);
5720 }
5721 }
5722 else
5723 {
5724 DenseMatrix<CRDoubleMatrix*> tmp_blocks_pt(
5725 ndof_in_parent_i, ndof_in_parent_j, 0);
5726
5727 Vector<Vector<unsigned>> new_block(
5728 ndof_in_parent_i, Vector<unsigned>(ndof_in_parent_j, 0));
5729
5730 for (unsigned dof_i = 0; dof_i < ndof_in_parent_i; dof_i++)
5731 {
5732 unsigned parent_dof_i = Doftype_coarsen_map_coarse[block_i][dof_i];
5733 if (is_subsidiary_block_preconditioner())
5734 {
5735 parent_dof_i =
5736 Doftype_in_master_preconditioner_coarse[parent_dof_i];
5737 }
5738
5739 for (unsigned dof_j = 0; dof_j < ndof_in_parent_j; dof_j++)
5740 {
5741 unsigned parent_dof_j = Doftype_coarsen_map_coarse[block_j][dof_j];
5742
5743 tmp_blocks_pt(dof_i, dof_j) = new CRDoubleMatrix;
5744
5745 new_block[dof_i][dof_j] = 1;
5746
5747 if (is_master_block_preconditioner())
5748 {
5749 internal_get_block(
5750 parent_dof_i, parent_dof_j, *tmp_blocks_pt(dof_i, dof_j));
5751 }
5752 else
5753 {
5754 parent_dof_j =
5755 Doftype_in_master_preconditioner_coarse[parent_dof_j];
5756
5757 master_block_preconditioner_pt()->get_dof_level_block(
5758 parent_dof_i,
5759 parent_dof_j,
5760 *tmp_blocks_pt(dof_i, dof_j),
5761 ignore_replacement_block);
5762 }
5763 }
5764 }
5765
5766 Vector<LinearAlgebraDistribution*> tmp_row_dist_pt(ndof_in_parent_i, 0);
5767
5768 for (unsigned parent_dof_i = 0; parent_dof_i < ndof_in_parent_i;
5769 parent_dof_i++)
5770 {
5771 unsigned mapped_dof_i =
5772 Doftype_coarsen_map_coarse[block_i][parent_dof_i];
5773
5774 if (is_master_block_preconditioner())
5775 {
5776 tmp_row_dist_pt[parent_dof_i] =
5777 Internal_block_distribution_pt[mapped_dof_i];
5778 }
5779 else
5780 {
5781 mapped_dof_i =
5782 Doftype_in_master_preconditioner_coarse[mapped_dof_i];
5783
5784 tmp_row_dist_pt[parent_dof_i] =
5785 master_block_preconditioner_pt()->dof_block_distribution_pt(
5786 mapped_dof_i);
5787 }
5788 }
5789
5790 Vector<LinearAlgebraDistribution*> tmp_col_dist_pt(ndof_in_parent_j, 0);
5791
5792 for (unsigned parent_dof_j = 0; parent_dof_j < ndof_in_parent_j;
5793 parent_dof_j++)
5794 {
5795 unsigned mapped_dof_j =
5796 Doftype_coarsen_map_coarse[block_j][parent_dof_j];
5797
5798 if (is_master_block_preconditioner())
5799 {
5800 tmp_col_dist_pt[parent_dof_j] =
5801 Internal_block_distribution_pt[mapped_dof_j];
5802 }
5803 else
5804 {
5805 mapped_dof_j =
5806 Doftype_in_master_preconditioner_coarse[mapped_dof_j];
5807 tmp_col_dist_pt[parent_dof_j] =
5808 master_block_preconditioner_pt()->dof_block_distribution_pt(
5809 mapped_dof_j);
5810 }
5811 }
5812
5814 tmp_row_dist_pt, tmp_col_dist_pt, tmp_blocks_pt, output_block);
5815
5816 for (unsigned dof_i = 0; dof_i < ndof_in_parent_i; dof_i++)
5817 {
5818 for (unsigned dof_j = 0; dof_j < ndof_in_parent_j; dof_j++)
5819 {
5820 if (new_block[dof_i][dof_j])
5821 {
5822 delete tmp_blocks_pt(dof_i, dof_j);
5823 }
5824 }
5825 }
5826 }
5827 }
5828 else
5829 {
5830 CRDoubleMatrixHelpers::deep_copy(tmp_block_pt, output_block);
5831 }
5832 }
5833
5834 //=============================================================================
5835 /// test function to check that every element in the block matrix
5836 /// (block_i,block_j) matches the corresponding element in the original matrix
5837 //=============================================================================
5838 template<typename MATRIX>
5840 const unsigned& block_i,
5841 const unsigned& block_j,
5842 const MATRIX* block_matrix_pt) const
5843 {
5844 // boolean flag to indicate whether test is passed
5845 bool check = true;
5846
5847 // number of rows in matrix
5848 unsigned n_row = matrix_pt()->nrow();
5849
5850 // number of columns in matrix
5851 unsigned n_col = matrix_pt()->ncol();
5852
5853 // loop over rows of original matrix
5854 for (unsigned i = 0; i < n_row; i++)
5855 {
5856 // if this coefficient is associated with a block in this block
5857 // preconditioner
5858 if (static_cast<int>(block_i) == this->internal_block_number(i))
5859 {
5860 // loop over columns of original matrix
5861 for (unsigned j = 0; j < n_col; j++)
5862 {
5863 // if the coeeficient is associated with a block in this block
5864 // preconditioner
5865 if (static_cast<int>(block_j) == this->internal_block_number(j))
5866 {
5867 // check whether elements in original matrix and matrix of block
5868 // pointers match
5869 if (matrix_pt()->operator()(i, j) !=
5870 block_matrix_pt->operator()(internal_index_in_block(i),
5871 internal_index_in_block(j)))
5872 {
5873 check = false;
5874 }
5875 }
5876 }
5877 }
5878 }
5879
5880 // throw error
5881 if (!check)
5882 {
5883 std::ostringstream error_message;
5884 error_message << "The require elements have not been successfully copied"
5885 << " from the original matrix to the block matrices";
5886 throw OomphLibError(
5887 error_message.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
5888 }
5889 }
5890
5891
5893
5894} // namespace oomph
e
Definition: cfortran.h:571
static char t char * s
Definition: cfortran.h:568
cstr elem_len * i
Definition: cfortran.h:603
Block Preconditioner base class. The block structure of the overall problem is determined from the Me...
void internal_return_block_vector(const unsigned &n, const DoubleVector &b, DoubleVector &v) const
Takes the n-th block ordered vector, b, and copies its entries to the appropriate entries in the natu...
void return_block_vector(const unsigned &n, const DoubleVector &b, DoubleVector &v) const
Takes the n-th block ordered vector, b, and copies its entries to the appropriate entries in the natu...
void internal_return_block_vectors(const Vector< unsigned > &block_vec_number, const Vector< DoubleVector > &s, DoubleVector &v) const
A helper function, takes the vector of block vectors, s, and copies its entries into the naturally or...
void internal_return_block_ordered_preconditioner_vector(const DoubleVector &w, DoubleVector &v) const
Takes the block ordered vector, w, and reorders it in the natural order. Reordered vector is returned...
void get_blocks(DenseMatrix< bool > &required_blocks, DenseMatrix< MATRIX * > &block_matrix_pt) const
Get all the block matrices required by the block preconditioner. Takes a pointer to a matrix of bools...
void return_concatenated_block_vector(const Vector< unsigned > &block_vec_number, const DoubleVector &b, DoubleVector &v) const
Takes concatenated block ordered vector, b, and copies its entries to the appropriate entries in the ...
static bool Run_block_matrix_test
Static boolean to allow block_matrix_test(...) to be run. Defaults to false.
void return_block_vectors(const Vector< unsigned > &block_vec_number, const Vector< DoubleVector > &s, DoubleVector &v) const
Takes the vector of block vectors, s, and copies its entries into the naturally ordered vector,...
void get_block_vector(const unsigned &n, const DoubleVector &v, DoubleVector &b) const
Takes the naturally ordered vector, v and returns the n-th block vector, b. Here n is the block numbe...
void return_block_ordered_preconditioner_vector(const DoubleVector &w, DoubleVector &v) const
Takes the block ordered vector, w, and reorders it in natural order. Reordered vector is returned in ...
void turn_into_subsidiary_block_preconditioner(BlockPreconditioner< MATRIX > *master_block_prec_pt, const Vector< unsigned > &doftype_in_master_preconditioner_coarse)
Function to turn this preconditioner into a subsidiary preconditioner that operates within a bigger "...
void internal_get_block(const unsigned &i, const unsigned &j, MATRIX &output_block) const
Gets block (i,j) from the matrix pointed to by Matrix_pt and returns it in output_block....
void internal_get_block_vectors(const Vector< unsigned > &block_vec_number, const DoubleVector &v, Vector< DoubleVector > &s) const
Takes the naturally ordered vector and rearranges it into a vector of sub vectors corresponding to th...
void get_block_vectors(const Vector< unsigned > &block_vec_number, const DoubleVector &v, Vector< DoubleVector > &s) const
Takes the naturally ordered vector and rearranges it into a vector of sub vectors corresponding to th...
void internal_get_block_vector(const unsigned &n, const DoubleVector &v, DoubleVector &b) const
A helper function, takes the naturally ordered vector, v, and extracts the n-th block vector,...
virtual void block_setup()
Determine the size of the matrix blocks and setup the lookup schemes relating the global degrees of f...
void get_concatenated_block_vector(const Vector< unsigned > &block_vec_number, const DoubleVector &v, DoubleVector &b)
Takes the naturally ordered vector and extracts the blocks indicated by the block number (the values)...
void internal_get_block_ordered_preconditioner_vector(const DoubleVector &v, DoubleVector &w) const
Given the naturally ordered vector, v, return the vector rearranged in block order in w....
void get_block_ordered_preconditioner_vector(const DoubleVector &v, DoubleVector &w)
Given the naturally ordered vector, v, return the vector rearranged in block order in w....
A class for compressed row matrices. This is a distributable object.
Definition: matrices.h:888
int * row_start()
Access to C-style row_start array.
Definition: matrices.h:1060
int * column_index()
Access to C-style column index array.
Definition: matrices.h:1072
void build_without_copy(const unsigned &ncol, const unsigned &nnz, double *value, int *column_index, int *row_start)
keeps the existing distribution and just matrix that is stored without copying the matrix data
Definition: matrices.cc:1710
double * value()
Access to C-style value array.
Definition: matrices.h:1084
void build(const LinearAlgebraDistribution *distribution_pt, const unsigned &ncol, const Vector< double > &value, const Vector< int > &column_index, const Vector< int > &row_start)
build method: vector of values, vector of column indices, vector of row starts and number of rows and...
Definition: matrices.cc:1672
static long Is_unclassified
Static "Magic number" used in place of the equation number to denote a value that hasn't been classif...
Definition: nodes.h:192
//////////////////////////////////////////////////////////////////////////// ////////////////////////...
Definition: matrices.h:386
unsigned long nrow() const
Return the number of rows of the matrix.
Definition: matrices.h:485
unsigned long ncol() const
Return the number of columns of the matrix.
Definition: matrices.h:491
unsigned nrow() const
access function to the number of global rows.
LinearAlgebraDistribution * distribution_pt() const
access to the LinearAlgebraDistribution
A vector in the mathematical sense, initially developed for linear algebra type applications....
Definition: double_vector.h:58
void build(const DoubleVector &old_vector)
Just copys the argument DoubleVector.
bool built() const
double * values_pt()
access function to the underlying values
bool distributed() const
access function to the distributed - indicates whether the distribution is serial or distributed
OomphCommunicator * communicator_pt() const
const access to the communicator pointer
An OomphLibError object which should be thrown when an run-time error is encountered....
An OomphLibWarning object which should be created as a temporary object to issue a warning....
A slight extension to the standard template vector class so that we can include "graceful" array rang...
Definition: Vector.h:58
void deep_copy(const CRDoubleMatrix *const in_matrix_pt, CRDoubleMatrix &out_matrix)
Create a deep copy of the matrix pointed to by in_matrix_pt.
Definition: matrices.h:3490
void concatenate_without_communication(const Vector< DoubleVector * > &in_vector_pt, DoubleVector &out_vector)
Concatenate DoubleVectors. Takes a Vector of DoubleVectors. If the out vector is built,...
void split_without_communication(const DoubleVector &in_vector, Vector< DoubleVector * > &out_vector_pt)
Split a DoubleVector into the out DoubleVectors. Data stays on its current processor,...
std::string string(const unsigned &i)
Return the i-th string or "" if the relevant string hasn't been defined.
void concatenate(const Vector< LinearAlgebraDistribution * > &in_distribution_pt, LinearAlgebraDistribution &out_distribution)
Takes a vector of LinearAlgebraDistribution objects and concatenates them such that the nrow_local of...
//////////////////////////////////////////////////////////////////// ////////////////////////////////...