partitioning.cc
1// LIC// ====================================================================
2// LIC// This file forms part of oomph-lib, the object-oriented,
3// LIC// multi-physics finite-element library, available
4// LIC// at http://www.oomph-lib.org.
5// LIC//
6// LIC// Copyright (C) 2006-2022 Matthias Heil and Andrew Hazel
7// LIC//
8// LIC// This library is free software; you can redistribute it and/or
9// LIC// modify it under the terms of the GNU Lesser General Public
10// LIC// License as published by the Free Software Foundation; either
11// LIC// version 2.1 of the License, or (at your option) any later version.
12// LIC//
13// LIC// This library is distributed in the hope that it will be useful,
14// LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16// LIC// Lesser General Public License for more details.
17// LIC//
18// LIC// You should have received a copy of the GNU Lesser General Public
19// LIC// License along with this library; if not, write to the Free Software
20// LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21// LIC// 02110-1301 USA.
22// LIC//
23// LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
24// LIC//
25// LIC//====================================================================
26#include <float.h>
27
28#include "partitioning.h"
29#include "mesh.h"
30#include "refineable_mesh.h"
31// Include to fill in additional_setup_shared_node_scheme() function
33
34#ifdef OOMPH_TRANSITION_TO_VERSION_3
35
36// for the new METIS API, need to use symbols defined in the standard header
37// which aren't available in the current frozen (old) version of METIS
38// Version 3 will (presumably) have this header in the include path as standard
39#include "metis.h"
40
41#endif
42
43namespace oomph
44{
45 //====================================================================
46 /// Namespace for METIS graph partitioning routines
47 //====================================================================
48 namespace METIS
49 {
50 /// Default function that translates spatial
51 /// error into weight for METIS partitioning (unit weight regardless
52 /// of input).
53 void default_error_to_weight_fct(const double& spatial_error,
54 const double& max_error,
55 const double& min_error,
56 int& weight)
57 {
58 weight = 1;
59 }
60
61 /// Function pointer to the function that translates spatial
62 /// error into weight for METIS partitioning.
63 ErrorToWeightFctPt Error_to_weight_fct_pt = &default_error_to_weight_fct;
64
65 } // namespace METIS
66
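// ------------------------------------------------------------------
// Illustrative sketch (not part of partitioning.cc): a driver code can
// bias the partitioning by installing its own error-to-weight function
// via the Error_to_weight_fct_pt pointer defined above. The function
// name my_error_to_weight_fct and the 1-10 weight range are assumptions
// made purely for this example.
// ------------------------------------------------------------------
void my_error_to_weight_fct(const double& spatial_error,
                            const double& max_error,
                            const double& min_error,
                            int& weight)
{
  // Map the element's error linearly onto integer weights 1,...,10 so
  // that elements with large errors (which are likely to be refined and
  // therefore to spawn more elements) count more heavily
  double range = max_error - min_error;
  if (range <= 0.0)
  {
    weight = 1;
  }
  else
  {
    weight = 1 + int(9.0 * (spatial_error - min_error) / range);
  }
}
//
// In the driver code, before calling METIS::partition_mesh(...):
//
//   METIS::Error_to_weight_fct_pt = &my_error_to_weight_fct;
// ------------------------------------------------------------------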
67
68 //==================================================================
69 /// Partition mesh uniformly by dividing elements
70 /// equally over the partitions, in the order
71 /// in which they are returned by problem.
72 /// On return, element_domain[ielem] contains the number
73 /// of the domain [0,1,...,ndomain-1] to which
74 /// element ielem has been assigned.
75 //==================================================================
76 void METIS::uniform_partition_mesh(Problem* problem_pt,
77 const unsigned& ndomain,
78 Vector<unsigned>& element_domain)
79 {
80 // Number of elements
81 unsigned nelem = problem_pt->mesh_pt()->nelement();
82
83#ifdef PARANOID
84 if (nelem != element_domain.size())
85 {
86 std::ostringstream error_stream;
87 error_stream << "element_domain Vector has wrong length " << nelem << " "
88 << element_domain.size() << std::endl;
89
90 throw OomphLibError(
91 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
92 }
93#endif
94
95
96 // Uniform partitioning
97 unsigned nel_per_domain = int(float(nelem) / float(ndomain));
98 for (unsigned ielem = 0; ielem < nelem; ielem++)
99 {
100 unsigned idomain = unsigned(float(ielem) / float(nel_per_domain));
101 element_domain[ielem] = idomain;
102 }
103 }
104
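// ------------------------------------------------------------------
// Illustrative usage sketch (not part of partitioning.cc): how a driver
// code might call the uniform partitioner defined above. The helper name
// doc_uniform_partition and the use of oomph_info for output are
// assumptions for the example only.
// ------------------------------------------------------------------
void doc_uniform_partition(Problem* problem_pt, const unsigned& n_domain)
{
  // One entry per element in the problem's global mesh
  Vector<unsigned> element_domain(problem_pt->mesh_pt()->nelement());

  // Assign elements to domains 0,...,n_domain-1 in mesh order
  METIS::uniform_partition_mesh(problem_pt, n_domain, element_domain);

  // Document the assignment
  unsigned nelem = element_domain.size();
  for (unsigned e = 0; e < nelem; e++)
  {
    oomph_info << "Element " << e << " --> domain " << element_domain[e]
               << std::endl;
  }
}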
105
106 //==================================================================
107 /// Use METIS to assign each element to a domain.
108 /// On return, element_domain[ielem] contains the number
109 /// of the domain [0,1,...,ndomain-1] to which
110 /// element ielem has been assigned.
111 /// - objective=0: minimise edgecut.
112 /// - objective=1: minimise total communications volume.
113 /// .
114 /// Partitioning is based on the dual graph of the mesh.
115 //==================================================================
116 void METIS::partition_mesh(Problem* problem_pt,
117 const unsigned& ndomain,
118 const unsigned& objective,
119 Vector<unsigned>& element_domain)
120 {
121 // Global mesh
122 Mesh* mesh_pt = problem_pt->mesh_pt();
123
124 // Number of elements
125 unsigned nelem = mesh_pt->nelement();
126
127#ifdef PARANOID
128 if (nelem != element_domain.size())
129 {
130 std::ostringstream error_stream;
131 error_stream << "element_domain Vector has wrong length " << nelem << " "
132 << element_domain.size() << std::endl;
133
134 throw OomphLibError(
135 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
136 }
137#endif
138
139 // Setup dual graph
140 //------------------
141
142 // Start timer
143 clock_t cpu_start = clock();
144
145 // Container to collect all elements associated with given global eqn number
146 std::map<unsigned, std::set<unsigned>> elements_connected_with_global_eqn;
147
148 // Container for all unique global eqn numbers
149 std::set<unsigned> all_global_eqns;
150
151 // Loop over all elements
152 for (unsigned e = 0; e < nelem; e++)
153 {
154 GeneralisedElement* el_pt = mesh_pt->element_pt(e);
155
156 // Add all global eqn numbers
157 unsigned ndof = el_pt->ndof();
158 for (unsigned j = 0; j < ndof; j++)
159 {
160 // Get global eqn number
161 unsigned eqn_number = el_pt->eqn_number(j);
162 elements_connected_with_global_eqn[eqn_number].insert(e);
163 all_global_eqns.insert(eqn_number);
164 }
165 }
166
167 // Now reverse the lookup scheme to find out all elements
168 // that are connected because they share the same global eqn
169 Vector<std::set<unsigned>> connected_elements(nelem);
170
171 // Counter for total number of entries in connected_elements structure
172 unsigned count = 0;
173
174 // Loop over all global eqns
175 for (std::set<unsigned>::iterator it = all_global_eqns.begin();
176 it != all_global_eqns.end();
177 it++)
178 {
179 // Get set of elements connected with this data item
180 std::set<unsigned> elements = elements_connected_with_global_eqn[*it];
181
182 // Double loop over connected elements: Everybody's connected to
183 // everybody
184 for (std::set<unsigned>::iterator it1 = elements.begin();
185 it1 != elements.end();
186 it1++)
187 {
188 for (std::set<unsigned>::iterator it2 = elements.begin();
189 it2 != elements.end();
190 it2++)
191 {
192 if ((*it1) != (*it2))
193 {
194 connected_elements[(*it1)].insert(*it2);
195 }
196 }
197 }
198 }
199
200
201 // Now convert into C-style packed array for interface with METIS
202 int* xadj = new int[nelem + 1];
203 Vector<int> adjacency_vector;
204
205 // Reserve (too much) space
206 adjacency_vector.reserve(count);
207
208 // Initialise counters
209 unsigned ientry = 0;
210
211 // Loop over all elements
212 for (unsigned e = 0; e < nelem; e++)
213 {
214 // First entry for current element
215 xadj[e] = ientry;
216
217 // Loop over elements that are connected to current element
218 typedef std::set<unsigned>::iterator IT;
219 for (IT it = connected_elements[e].begin();
220 it != connected_elements[e].end();
221 it++)
222 {
223 // Copy into adjacency array
224 adjacency_vector.push_back(*it);
225
226 // We've just made another entry
227 ientry++;
228 }
229
230 // Entry after last entry for current element:
231 xadj[e + 1] = ientry;
232 }
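// Illustrative (commented-out) example of the compressed storage built
// above, for a hypothetical three-element mesh in which element 0 is
// connected to 1, element 1 to 0 and 2, and element 2 to 1:
//
//   adjacency_vector = [ 1,   0, 2,   1 ]
//   xadj             = [ 0, 1, 3, 4 ]
//
// i.e. the neighbours of element e occupy entries xadj[e],...,xadj[e+1]-1
// of the adjacency vector; this is the CSR-style graph format expected by
// the METIS_PartGraph* routines.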
233
234 // End timer
235 clock_t cpu_end = clock();
236
237 // Doc
238 double cpu0 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
239 oomph_info
240 << "CPU time for setup of METIS data structures [nelem="
241 << nelem << "]: " << cpu0 << " sec" << std::endl;
242
243
244 // Call METIS graph partitioner
245 //-----------------------------
246
247 // Start timer
248 cpu_start = clock();
249
250 // Number of vertices in graph
251 int nvertex = nelem;
252
253 // No vertex weights
254 int* vwgt = 0;
255
256 // No edge weights
257 int* adjwgt = 0;
258
259 // Flag indicating that graph isn't weighted: 0; vertex weights only: 2
260 // Note that wgtflag==2 requires nodal weights to be stored in vwgt.
261 int wgtflag = 0;
262
263 // Use C-style numbering (first array entry is zero)
264 int numflag = 0;
265
266 // Number of desired partitions
267 int nparts = ndomain;
268
269 // Use default options
270 int* options = new int[10];
271 options[0] = 0;
272
273#ifdef OOMPH_TRANSITION_TO_VERSION_3
274 switch (objective)
275 {
276 case 0:
277 // Edge-cut minimization
278 options[0] = METIS_OBJTYPE_CUT;
279 break;
280
281 case 1:
282 // communication volume minimisation
283 options[0] = METIS_OBJTYPE_VOL;
284 break;
285
286 default:
287 std::ostringstream error_stream;
288 error_stream << "Wrong objective for METIS. objective = " << objective
289 << std::endl;
290
291 throw OomphLibError(
292 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
293 }
294#endif
295
296 // Number of cut edges in graph
297 int* edgecut = new int[nelem];
298
299 // Array containing the partition information
300 int* part = new int[nelem];
301
302 // Can we get an error estimate?
303
304 unsigned n_mesh = problem_pt->nsub_mesh();
305
306 if (n_mesh == 0)
307 {
308 RefineableMeshBase* mmesh_pt = dynamic_cast<RefineableMeshBase*>(mesh_pt);
309 if (mmesh_pt != 0)
310 {
311 // Bias distribution?
312 if (Error_to_weight_fct_pt != &default_error_to_weight_fct)
313 {
314 oomph_info
315 << "Biasing element distribution via spatial error estimate\n";
316
317 // Adjust flag and provide storage for weights
318 wgtflag = 2;
319 vwgt = new int[nelem];
320
321 // Get error for all elements
322 Vector<double> elemental_error(nelem);
323 mmesh_pt->spatial_error_estimator_pt()->get_element_errors(
324 mesh_pt, elemental_error);
325
326 double max_error =
327 *(std::max_element(elemental_error.begin(), elemental_error.end()));
328 double min_error =
329 *(std::min_element(elemental_error.begin(), elemental_error.end()));
330
331 // Bias weights
332 int weight = 1;
333 for (unsigned e = 0; e < nelem; e++)
334 {
335 // Translate error into weight
336 Error_to_weight_fct_pt(
337 elemental_error[e], max_error, min_error, weight);
338 vwgt[e] = weight;
339 }
340 }
341 }
342 }
343 else // There are submeshes
344 {
345 // Are any of the submeshes refineable?
346 bool refineable_submesh_exists = false;
347 // Vector to store "start and end point" for loops in submeshes
348 Vector<unsigned> loop_helper(n_mesh + 1);
349 loop_helper[0] = 0;
350
351 // Loop over submeshes
352 for (unsigned i_mesh = 0; i_mesh < n_mesh; i_mesh++)
353 {
354 // Store the end of the loop
355 loop_helper[i_mesh + 1] =
356 problem_pt->mesh_pt(i_mesh)->nelement() + loop_helper[i_mesh];
357
358 RefineableMeshBase* mmesh_pt =
359 dynamic_cast<RefineableMeshBase*>(problem_pt->mesh_pt(i_mesh));
360 if (mmesh_pt != 0)
361 {
362 refineable_submesh_exists = true;
363 }
364 }
365
366 // If a refineable submesh exists
367 if (refineable_submesh_exists)
368 {
369 // Bias distribution?
370 if (Error_to_weight_fct_pt != &default_error_to_weight_fct)
371 {
372 oomph_info
373 << "Biasing element distribution via spatial error estimate\n";
374
375 // Adjust flag and provide storage for weights
376 wgtflag = 2;
377 vwgt = new int[nelem];
378
379 // Loop over submeshes
380 for (unsigned i_mesh = 0; i_mesh < n_mesh; i_mesh++)
381 {
382 RefineableMeshBase* mmesh_pt =
383 dynamic_cast<RefineableMeshBase*>(problem_pt->mesh_pt(i_mesh));
384 if (mmesh_pt != 0)
385 {
386 // Get error for all elements
387 unsigned nsub_elem =
388 loop_helper[i_mesh + 1] - loop_helper[i_mesh];
389 Vector<double> elemental_error(nsub_elem);
390 mmesh_pt->spatial_error_estimator_pt()->get_element_errors(
391 problem_pt->mesh_pt(i_mesh), elemental_error);
392
393 double max_error = *(std::max_element(elemental_error.begin(),
394 elemental_error.end()));
395 double min_error = *(std::min_element(elemental_error.begin(),
396 elemental_error.end()));
397
398 // Bias weights
399 int weight = 1;
400 unsigned start = loop_helper[i_mesh];
401 unsigned end = loop_helper[i_mesh + 1];
402 for (unsigned e = start; e < end; e++)
403 {
404 unsigned error_index = e - start;
405 // Translate error into weight
406 Error_to_weight_fct_pt(
407 elemental_error[error_index], max_error, min_error, weight);
408 vwgt[e] = weight;
409 }
410 }
411 else // This mesh is not refineable
412 {
413 // There's no error estimator, so use the default weight
414 int weight = 1;
415 unsigned start = loop_helper[i_mesh];
416 unsigned end = loop_helper[i_mesh + 1];
417 for (unsigned e = start; e < end; e++)
418 {
419 vwgt[e] = weight;
420 }
421 }
422 }
423 }
424 }
425 }
426
427#ifdef OOMPH_TRANSITION_TO_VERSION_3
428
429 // Call partitioner
430 METIS_PartGraphKway(&nvertex,
431 xadj,
432 &adjacency_vector[0],
433 vwgt,
434 adjwgt,
435 &wgtflag,
436 &numflag,
437 &nparts,
438 options,
439 edgecut,
440 part);
441#else
442 // original code to delete in version 3
443
444 // Call partitioner
445 if (objective == 0)
446 {
447 // Partition with the objective of minimising the edge cut
448 METIS_PartGraphKway(&nvertex,
449 xadj,
450 &adjacency_vector[0],
451 vwgt,
452 adjwgt,
453 &wgtflag,
454 &numflag,
455 &nparts,
456 options,
457 edgecut,
458 part);
459 }
460 else if (objective == 1)
461 {
462 // Partition with the objective of minimising the total communication
463 // volume
464 METIS_PartGraphVKway(&nvertex,
465 xadj,
466 &adjacency_vector[0],
467 vwgt,
468 adjwgt,
469 &wgtflag,
470 &numflag,
471 &nparts,
472 options,
473 edgecut,
474 part);
475 }
476 else
477 {
478 std::ostringstream error_stream;
479 error_stream << "Wrong objective for METIS. objective = " << objective
480 << std::endl;
481
482 throw OomphLibError(
483 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
484 }
485#endif
486
487#ifdef PARANOID
488 std::vector<bool> done(nparts, false);
489#endif
490
491 // Copy across
492 for (unsigned e = 0; e < nelem; e++)
493 {
494 element_domain[e] = part[e];
495#ifdef PARANOID
496 done[part[e]] = true;
497#endif
498 }
499
500
501#ifdef PARANOID
502 // Check
503 std::ostringstream error_stream;
504 bool shout = false;
505 for (int p = 0; p < nparts; p++)
506 {
507 if (!done[p])
508 {
509 shout = true;
510 error_stream << "No elements on processor " << p
511 << " when trying to partition " << nelem << " elements over "
512 << nparts << " processors!\n";
513 }
514 }
515 if (shout)
516 {
517 throw OomphLibError(
518 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
519 }
520#endif
521
522
523 // End timer
524 cpu_end = clock();
525
526 // Doc
527 double cpu1 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
528 oomph_info
529 << "CPU time for METIS mesh partitioning [nelem="
530 << nelem << "]: " << cpu1 << " sec" << std::endl;
531
532
533 // Cleanup
534 delete[] xadj;
535 delete[] part;
536 delete[] edgecut;
537 delete[] options;
538 }
539
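// ------------------------------------------------------------------
// Illustrative usage sketch (not part of partitioning.cc): calling the
// METIS-based partitioner defined above from a driver code. The helper
// name and the choice of four domains are assumptions for the example;
// objective=0 requests edge-cut minimisation, objective=1 requests
// minimisation of the total communication volume.
// ------------------------------------------------------------------
void doc_metis_partition(Problem* problem_pt)
{
  // Number of domains and partitioning objective
  unsigned n_domain = 4;
  unsigned objective = 0; // minimise edge cut

  // Storage for the target domain of every element in the global mesh
  Vector<unsigned> element_domain(problem_pt->mesh_pt()->nelement());

  // Let METIS assign each element to a domain in [0,n_domain-1]
  METIS::partition_mesh(problem_pt, n_domain, objective, element_domain);
}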
540
541#ifdef OOMPH_HAS_MPI
542
543
544 //==================================================================
545 /// Use METIS to assign each element in an already-distributed mesh
546 /// to a domain. On return, element_domain_on_this_proc[e] contains the number
547 /// of the domain [0,1,...,ndomain-1] to which non-halo element e on THE
548 /// CURRENT PROCESSOR ONLY has been assigned. The order of the non-halo
549 /// elements is the same as in the Problem's mesh, with the halo
550 /// elements being skipped.
551 /// Objective:
552 /// - objective=0: minimise edgecut.
553 /// - objective=1: minimise total communications volume.
554 /// .
555 /// The partitioning is based on the dof graph of the complete mesh by
556 /// taking into
557 /// account which global equation numbers are affected by each element and
558 /// connecting elements which affect the same global equation number.
559 /// Partitioning is done such that all elements associated with the
560 /// same tree root move together. Non-refineable elements are
561 /// treated as their own root elements. If the optional boolean
562 /// flag is set to true (it defaults to false) each processor
563 /// assigns a dumb-but-repeatable equidistribution of its non-halo
564 /// elements over the domains and outputs the input that would have
565 /// gone into METIS in the file metis_input_for_validation.dat
566 //==================================================================
567 void METIS::partition_distributed_mesh(
568 Problem* problem_pt,
569 const unsigned& objective,
570 Vector<unsigned>& element_domain_on_this_proc,
571 const bool& bypass_metis)
572 {
573 // Start timer
574 clock_t cpu_start = clock();
575
576 // Communicator
577 OomphCommunicator* comm_pt = problem_pt->communicator_pt();
578
579 // Number of processors / domains
580 unsigned n_proc = comm_pt->nproc();
581 unsigned my_rank = comm_pt->my_rank();
582
583 // Global mesh
584 Mesh* mesh_pt = problem_pt->mesh_pt();
585
586 // Total number of elements (halo and nonhalo) on this proc
587 unsigned n_elem = mesh_pt->nelement();
588
589 // Get elemental assembly times
590 Vector<double> elemental_assembly_time =
591 problem_pt->elemental_assembly_time();
592
593#ifdef PARANOID
594 unsigned n = elemental_assembly_time.size();
595 if ((n != 0) && (n != n_elem))
596 {
597 std::ostringstream error_stream;
598 error_stream << "Number of elements doesn't match the \n"
599 << "number of elemental assembly times: " << n_elem << " "
600 << n << std::endl;
601 throw OomphLibError(
602 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
603 }
604#endif
605
606 // Can we base load balancing on assembly times?
607 bool can_load_balance_on_assembly_times = false;
608 if (elemental_assembly_time.size() != 0)
609 {
610 can_load_balance_on_assembly_times = true;
611 }
612
613 // Storage for global eqn numbers on current processor
614 std::set<unsigned> global_eqns_on_this_proc;
615
616 // Storage for pointers to root elements that are connected with given
617 // eqn number -- assembled on local processor
618 std::map<unsigned, std::set<GeneralisedElement*>>
619 root_elements_connected_with_global_eqn_on_this_proc;
620
621 // Storage for long sequence of equation numbers as encountered
622 // by the root elements on this processor
623 Vector<unsigned> eqn_numbers_with_root_elements_on_this_proc;
624
625 // Reserve storage: number of elements times a rough estimate of the
626 // number of dofs per element
627 eqn_numbers_with_root_elements_on_this_proc.reserve(n_elem * 9);
628
629 // Storage for the number of eqn numbers associated with each
630 // root element on this processors -- once this and the previous
631 // container have been collected from all processors we're
632 // able to reconstruct which root element (in the nominal "global" mesh)
633 // is connected with which global equations
634 Vector<unsigned> number_of_dofs_for_root_element;
635 number_of_dofs_for_root_element.reserve(n_elem);
636
637 // Ditto for number of "leaf" elements connected with each root
638 Vector<unsigned> number_of_non_halo_elements_for_root_element;
639 number_of_non_halo_elements_for_root_element.reserve(n_elem);
640
641 // Ditto for total assembly time of "leaf" elements connected with each root
642 Vector<double> total_assembly_time_for_root_element;
643 total_assembly_time_for_root_element.reserve(n_elem);
644
645 // Map storing the number of the root elements on this processor
646 // (offset by one to bypass the zero default).
647 std::map<GeneralisedElement*, unsigned> root_el_number_plus_one;
648
649 // Loop over non-halo elements on this processor
650 int number_of_root_elements = 0;
651 unsigned number_of_non_halo_elements = 0;
652 for (unsigned e = 0; e < n_elem; e++)
653 {
654 double el_assembly_time = 0.0;
655 GeneralisedElement* el_pt = mesh_pt->element_pt(e);
656 if (!el_pt->is_halo())
657 {
658 if (can_load_balance_on_assembly_times)
659 {
660 el_assembly_time = elemental_assembly_time[e];
661 }
662
663 // Get the associated root element which is either...
664 GeneralisedElement* root_el_pt = 0;
665 RefineableElement* ref_el_pt = dynamic_cast<RefineableElement*>(el_pt);
666 if (ref_el_pt != 0)
667 {
668 //...the actual root element
669 root_el_pt = ref_el_pt->root_element_pt();
670 }
671 // ...or the element itself
672 else
673 {
674 root_el_pt = el_pt;
675 }
676
677 // Have we already encountered this root element?
678 // (offset of one to bypass the default return of zero)
679 bool already_encountered = false;
680 unsigned root_el_number = root_el_number_plus_one[root_el_pt];
681 if (root_el_number_plus_one[root_el_pt] == 0)
682 {
683 // This is a new one
684 already_encountered = false;
685
686 // Give it a number
687 number_of_root_elements++;
688 root_el_number_plus_one[root_el_pt] = number_of_root_elements;
689
690 // Remove offset
691 root_el_number = number_of_root_elements - 1;
692 }
693 else
694 {
695 // We've already visited this one before...
696 already_encountered = true;
697
698 // Remove offset
699 root_el_number -= 1;
700 }
701
702
703 // Get global equation numbers of actual element
704 unsigned n_dof = el_pt->ndof();
705 for (unsigned i = 0; i < n_dof; i++)
706 {
707 unsigned eqn_no = el_pt->eqn_number(i);
708
709 // Record which root elements are connected with this eqn number
710 root_elements_connected_with_global_eqn_on_this_proc[eqn_no].insert(
711 root_el_pt);
712
713 // Record all global eqn numbers on this processor
714 global_eqns_on_this_proc.insert(eqn_no);
715
716 // Add eqn number of the current element to the long sequence
717 // of eqn numbers
718 eqn_numbers_with_root_elements_on_this_proc.push_back(eqn_no);
719 }
720
721 // Now record how many equations are associated with the current
722 // non-halo element
723 if (already_encountered)
724 {
725 number_of_dofs_for_root_element[root_el_number] += n_dof;
726 number_of_non_halo_elements_for_root_element[root_el_number]++;
727 total_assembly_time_for_root_element[root_el_number] +=
728 el_assembly_time;
729 }
730 else
731 {
732 number_of_dofs_for_root_element.push_back(n_dof);
733 number_of_non_halo_elements_for_root_element.push_back(1);
734 total_assembly_time_for_root_element.push_back(el_assembly_time);
735 }
736
737 // Bump up number of non-halos
738 number_of_non_halo_elements++;
739 }
740 }
741
742 // Tell everybody how many root elements
743 // are on each processor
744 unsigned root_processor = 0;
745 Vector<int> number_of_root_elements_on_each_proc(n_proc, 0);
746 MPI_Allgather(&number_of_root_elements,
747 1,
748 MPI_INT,
749 &number_of_root_elements_on_each_proc[0],
750 1,
751 MPI_INT,
752 comm_pt->mpi_comm());
753
754
755 // In the big sequence of concatenated root elements (enumerated
756 // individually on the various processors) where do the root elements from a
757 // given processor start? Also figure out how many root elements there are
758 // in total by summing up their numbers
759 Vector<int> start_index(n_proc, 0);
760 unsigned total_number_of_root_elements = 0;
761 for (unsigned i_proc = 0; i_proc < n_proc; i_proc++)
762 {
763 total_number_of_root_elements +=
764 number_of_root_elements_on_each_proc[i_proc];
765 if (i_proc != 0)
766 {
767 start_index[i_proc] = total_number_of_root_elements -
768 number_of_root_elements_on_each_proc[i_proc];
769 }
770 else
771 {
772 start_index[0] = 0;
773 }
774 }
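// Commented-out illustration of the bookkeeping above: with three
// processors holding, say, 4, 2 and 3 root elements respectively,
//
//   number_of_root_elements_on_each_proc = [ 4, 2, 3 ]
//   start_index                          = [ 0, 4, 6 ]
//   total_number_of_root_elements        = 9
//
// start_index[] later serves as the displacement array for MPI_Gatherv /
// MPI_Scatterv so that each processor's root elements occupy a contiguous
// chunk of the concatenated ("global") vectors assembled on the root
// processor.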
775
776
777 // How many global equations are held on this processor?
778 int n_eqns_on_this_proc =
779 eqn_numbers_with_root_elements_on_this_proc.size();
780
781 // Gather this information for all processors:
782 // n_eqns_on_each_proc[iproc] now contains the number of global
783 // equations held on processor iproc.
784 Vector<int> n_eqns_on_each_proc(n_proc, 0);
785 MPI_Allgather(&n_eqns_on_this_proc,
786 1,
787 MPI_INT,
788 &n_eqns_on_each_proc[0],
789 1,
790 MPI_INT,
791 comm_pt->mpi_comm());
792
793
794 // In the big sequence of equation numbers from the root elements
795 // (enumerated individually on the various processors) where do the
796 // equation numbers associated with the root elements from a given
797 // processor start? Also figure out how long the sequence of equation
798 // numbers is
799 Vector<int> start_eqns_index(n_proc, 0);
800 unsigned total_n_eqn = 0;
801 for (unsigned i_proc = 0; i_proc < n_proc; i_proc++)
802 {
803 total_n_eqn += n_eqns_on_each_proc[i_proc];
804 if (i_proc != 0)
805 {
806 start_eqns_index[i_proc] = total_n_eqn - n_eqns_on_each_proc[i_proc];
807 }
808 else
809 {
810 start_eqns_index[0] = 0;
811 }
812 }
813
814
815 // Big vector that contains the number of dofs for each root element
816 // (concatenated in processor-by-processor order)
817 Vector<unsigned> number_of_dofs_for_global_root_element(
818 total_number_of_root_elements);
819 // Create at least one entry so we don't get a seg fault below
820 if (number_of_dofs_for_root_element.size() == 0)
821 {
822 number_of_dofs_for_root_element.resize(1);
823 }
824 MPI_Gatherv(
825 &number_of_dofs_for_root_element[0], // pointer to first entry in
826 // vector to be gathered on root
827 number_of_root_elements, // Number of entries to be sent
828 // from current processor
829 MPI_UNSIGNED,
830 &number_of_dofs_for_global_root_element[0], // Target -- this will
831 // store the concatenated
832 // vectors sent from
833 // everywhere
834 &number_of_root_elements_on_each_proc[0], // Pointer to
835 // vector containing
836 // the length of the
837 // vectors received
838 // from elsewhere
839 &start_index[0], // "offset" for storage of vector received
840 // from various processors in the global
841 // concatenated vector stored on root
842 MPI_UNSIGNED,
843 root_processor,
844 comm_pt->mpi_comm());
845
846
847 // ditto for number of non-halo elements associated with root element
848 Vector<unsigned> number_of_non_halo_elements_for_global_root_element(
849 total_number_of_root_elements);
850
851 // Create at least one entry so we don't get a seg fault below
852 if (number_of_non_halo_elements_for_root_element.size() == 0)
853 {
854 number_of_non_halo_elements_for_root_element.resize(1);
855 }
856 MPI_Gatherv(&number_of_non_halo_elements_for_root_element[0],
857 // pointer to first entry in
858 // vector to be gathered on root
859 number_of_root_elements, // Number of entries to be sent
860 // from current processor
861 MPI_UNSIGNED,
862 &number_of_non_halo_elements_for_global_root_element[0],
863 // Target -- this will
864 // store the concatenated
865 // vectors sent from
866 // everywhere
867 &number_of_root_elements_on_each_proc[0], // Pointer to
868 // vector containing
869 // the length of the
870 // vectors received
871 // from elsewhere
872 &start_index[0], // "offset" for storage of vector received
873 // from various processors in the global
874 // concatenated vector stored on root
875 MPI_UNSIGNED,
876 root_processor,
877 comm_pt->mpi_comm());
878
879
880 // ditto for total assembly time of the elements associated with each root element
881 Vector<double> total_assembly_time_for_global_root_element(
882 total_number_of_root_elements);
883
884 // Create at least one entry so we don't get a seg fault below
885 if (total_assembly_time_for_root_element.size() == 0)
886 {
887 total_assembly_time_for_root_element.resize(1);
888 }
889 MPI_Gatherv(&total_assembly_time_for_root_element[0],
890 // pointer to first entry in
891 // vector to be gathered on root
892 number_of_root_elements, // Number of entries to be sent
893 // from current processor
894 MPI_DOUBLE,
895 &total_assembly_time_for_global_root_element[0],
896 // Target -- this will
897 // store the concatenated
898 // vectors sent from
899 // everywhere
900 &number_of_root_elements_on_each_proc[0], // Pointer to
901 // vector containing
902 // the length of the
903 // vectors received
904 // from elsewhere
905 &start_index[0], // "offset" for storage of vector received
906 // from various processors in the global
907 // concatenated vector stored on root
908 MPI_DOUBLE,
909 root_processor,
910 comm_pt->mpi_comm());
911
912
913 // Big vector to store the long sequence of global equation numbers
914 // associated with the long sequence of root elements
915 Vector<unsigned> eqn_numbers_with_root_elements(total_n_eqn);
916
917 // Create at least one entry so we don't get a seg fault below
918 if (eqn_numbers_with_root_elements_on_this_proc.size() == 0)
919 {
920 eqn_numbers_with_root_elements_on_this_proc.resize(1);
921 }
922 MPI_Gatherv(&eqn_numbers_with_root_elements_on_this_proc[0],
923 n_eqns_on_this_proc,
924 MPI_UNSIGNED,
925 &eqn_numbers_with_root_elements[0],
926 &n_eqns_on_each_proc[0],
927 &start_eqns_index[0],
928 MPI_UNSIGNED,
929 root_processor,
930 comm_pt->mpi_comm());
931
932 // Doc
933 clock_t cpu_end = clock();
934
935 double cpu0 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
936 oomph_info
937 << "CPU time for global setup of METIS data structures [nroot_elem="
938 << total_number_of_root_elements << "]: " << cpu0 << " sec" << std::endl;
939
940
941 // Now the root processor has gathered all the data needed to establish
942 // the root element connectivity (as in the serial case) so use METIS
943 // to determine "partitioning" for non-uniformly refined mesh
944 //----------------------------------------------------------------------
945
946 // Vector to store target domain for each of the root elements (concatenated
947 // in processor-by-processor order)
948 Vector<unsigned> root_element_domain(total_number_of_root_elements, 0);
949 if (my_rank == root_processor) //--
950 {
951 // Start timer
952 clock_t cpu_start = clock();
953
954 // Repeat the steps used in the serial code: Storage for
955 // the global equations (on root processor)
956 std::set<unsigned> all_global_eqns_root_processor;
957
958 // Set of root elements (as enumerated in the processor-by-processor
959 // order) associated with given global equation number
960 std::map<unsigned, std::set<unsigned>>
961 root_elements_connected_with_global_eqn_on_root_processor;
962
963 // Retrace the steps of the serial code: Who's connected with who
964 unsigned count_all = 0;
965 for (unsigned e = 0; e < total_number_of_root_elements; e++)
966 {
967 unsigned n_eqn_no = number_of_dofs_for_global_root_element[e];
968 for (unsigned n = 0; n < n_eqn_no; n++)
969 {
970 unsigned eqn_no = eqn_numbers_with_root_elements[count_all];
971 count_all++;
972 root_elements_connected_with_global_eqn_on_root_processor[eqn_no]
973 .insert(e);
974 all_global_eqns_root_processor.insert(eqn_no);
975 }
976 }
977
978 // Number of domains
979 unsigned ndomain = n_proc;
980
981 // Now reverse the lookup scheme to find out all root elements
982 // that are connected because they share the same global eqn
983 Vector<std::set<unsigned>> connected_root_elements(
984 total_number_of_root_elements);
985
986 // Counter for total number of entries in connected_root_elements
987 // structure
988 unsigned count = 0;
989
990 // Loop over all global eqns
991 for (std::set<unsigned>::iterator it =
992 all_global_eqns_root_processor.begin();
993 it != all_global_eqns_root_processor.end();
994 it++)
995 {
996 // Get set of root elements connected with this data item
997 std::set<unsigned> root_elements =
998 root_elements_connected_with_global_eqn_on_root_processor[*it];
999
1000 // Double loop over connected root elements: Everybody's connected to
1001 // everybody
1002 for (std::set<unsigned>::iterator it1 = root_elements.begin();
1003 it1 != root_elements.end();
1004 it1++)
1005 {
1006 for (std::set<unsigned>::iterator it2 = root_elements.begin();
1007 it2 != root_elements.end();
1008 it2++)
1009 {
1010 if ((*it1) != (*it2))
1011 {
1012 connected_root_elements[(*it1)].insert(*it2);
1013 }
1014 }
1015 }
1016 }
1017
1018 // End timer
1019 clock_t cpu_end = clock();
1020
1021 // Doc
1022 double cpu0b = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
1023 oomph_info << "CPU time for setup of connected elements (load balance) "
1024 "[nroot_elem="
1025 << total_number_of_root_elements << "]: " << cpu0b << " sec"
1026 << std::endl;
1027
1028 // Now convert into C-style packed array for interface with METIS
1029 cpu_start = clock();
1030 int* xadj = new int[total_number_of_root_elements + 1];
1031 Vector<int> adjacency_vector;
1032
1033 // Reserve (too much) space
1034 adjacency_vector.reserve(count);
1035
1036 // Initialise counters
1037 unsigned ientry = 0;
1038
1039 // Loop over all elements
1040 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1041 {
1042 // First entry for current element
1043 xadj[e] = ientry;
1044
1045 // Loop over elements that are connected to current element
1046 typedef std::set<unsigned>::iterator IT;
1047 for (IT it = connected_root_elements[e].begin();
1048 it != connected_root_elements[e].end();
1049 it++)
1050 {
1051 // Copy into adjacency array
1052 adjacency_vector.push_back(*it);
1053
1054 // We've just made another entry
1055 ientry++;
1056 }
1057
1058 // Entry after last entry for current element:
1059 xadj[e + 1] = ientry;
1060 }
1061
1062 // End timer
1063 cpu_end = clock();
1064
1065 // Doc
1066 double cpu0 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
1067 oomph_info << "CPU time for setup of METIS data structures (load "
1068 "balance) [nroot_elem="
1069 << total_number_of_root_elements << "]: " << cpu0 << " sec"
1070 << std::endl;
1071
1072
1073 // Call METIS graph partitioner
1074 //-----------------------------
1075
1076 // Start timer
1077 cpu_start = clock();
1078
1079 // Number of vertices in graph
1080 int nvertex = total_number_of_root_elements;
1081
1082 // No vertex weights
1083 int* vwgt = 0;
1084
1085 // No edge weights
1086 int* adjwgt = 0;
1087
1088 // Flag indicating that graph isn't weighted: 0; vertex weights only: 2
1089 // Note that wgtflag==2 requires nodal weights to be stored in vwgt.
1090 int wgtflag = 0;
1091
1092 // Use C-style numbering (first array entry is zero)
1093 int numflag = 0;
1094
1095 // Number of desired partitions
1096 int nparts = ndomain;
1097
1098 // Use default options
1099 int* options = new int[10];
1100 options[0] = 0;
1101
1102#ifdef OOMPH_TRANSITION_TO_VERSION_3
1103 switch (objective)
1104 {
1105 case 0:
1106 // Edge-cut minimization
1107 options[0] = METIS_OBJTYPE_CUT;
1108 break;
1109
1110 case 1:
1111 // communication volume minimisation
1112 options[0] = METIS_OBJTYPE_VOL;
1113 break;
1114
1115 default:
1116 std::ostringstream error_stream;
1117 error_stream << "Wrong objective for METIS. objective = " << objective
1118 << std::endl;
1119
1120 throw OomphLibError(error_stream.str(),
1121 OOMPH_CURRENT_FUNCTION,
1122 OOMPH_EXCEPTION_LOCATION);
1123 }
1124#endif
1125
1126 // Number of cut edges in graph
1127 int* edgecut = new int[total_number_of_root_elements];
1128
1129 // Array containing the partition information
1130 int* part = new int[total_number_of_root_elements];
1131
1132 // Now bias distribution by giving each root element
1133 // a weight equal to the number of elements associated with it
1134
1135 // Adjust flag and provide storage for weights
1136 wgtflag = 2;
1137 vwgt = new int[total_number_of_root_elements];
1138
1139
1140 // Load balance based on assembly times of all leaf
1141 // elements associated with root
1142 if (can_load_balance_on_assembly_times)
1143 {
1144 oomph_info << "Basing distribution on assembly times of elements\n";
1145
1146 // Normalise
1147 double min_time = *(
1148 std::min_element(total_assembly_time_for_global_root_element.begin(),
1149 total_assembly_time_for_global_root_element.end()));
1150#ifdef PARANOID
1151 if (min_time == 0.0)
1152 {
1153 std::ostringstream error_stream;
1154 error_stream << "Minimum assembly time for element is zero!\n";
1155 throw OomphLibError(error_stream.str(),
1156 OOMPH_CURRENT_FUNCTION,
1157 OOMPH_EXCEPTION_LOCATION);
1158 }
1159#endif
1160
1161 // Bypass METIS (usually for validation) and use made-up but
1162 // repeatable timings
1163 if (bypass_metis)
1164 {
1165 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1166 {
1167 vwgt[e] = e;
1168 }
1169 }
1170 else
1171 {
1172 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1173 {
1174 // Use assembly times (relative to minimum) as weight
1175 vwgt[e] =
1176 int(total_assembly_time_for_global_root_element[e] / min_time);
1177 }
1178 }
1179 }
1180 // Load balance based on the number of leaf elements associated with
1181 // root
1182 else
1183 {
1184 oomph_info << "Basing distribution on number of elements\n";
1185 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1186 {
1187 vwgt[e] = number_of_non_halo_elements_for_global_root_element[e];
1188 }
1189 }
1190
1191 // Bypass METIS (usually for validation)
1192 if (bypass_metis)
1193 {
1194 // Simple repeatable partition: Equidistribute root elements
1195 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1196 {
1197 // Simple repeatable partition: Equidistribute elements on each
1198 // processor
1199 part[e] = (n_proc - 1) -
1200 unsigned(double(e) / double(total_number_of_root_elements) *
1201 double(n_proc));
1202 }
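// Commented-out illustration of the repeatable "dumb" partition above:
// with n_proc=2 and total_number_of_root_elements=4 the formula gives
//
//   e=0: part = 1 - unsigned(0.0/4.0 * 2.0) = 1
//   e=1: part = 1 - unsigned(1.0/4.0 * 2.0) = 1
//   e=2: part = 1 - unsigned(2.0/4.0 * 2.0) = 0
//   e=3: part = 1 - unsigned(3.0/4.0 * 2.0) = 0
//
// i.e. the first half of the root elements goes to the last processor and
// the second half to the first processor; deterministic, so suitable for
// validation runs.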
1203
1204 oomph_info
1205 << "Bypassing METIS for validation purposes.\n"
1206 << "Appending input for metis in metis_input_for_validation.dat\n";
1207 std::ofstream outfile;
1208 outfile.open("metis_input_for_validation.dat", std::ios_base::app);
1209
1210 // Dump out relevant input to metis
1211 for (unsigned e = 0; e < total_number_of_root_elements + 1; e++)
1212 {
1213 outfile << xadj[e] << std::endl;
1214 }
1215 unsigned n = adjacency_vector.size();
1216 for (unsigned i = 0; i < n; i++)
1217 {
1218 outfile << adjacency_vector[i] << std::endl;
1219 }
1220 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1221 {
1222 outfile << vwgt[e] << std::endl;
1223 }
1224 outfile.close();
1225 }
1226 // Actually use METIS (good but not always repeatable!)
1227 else
1228 {
1229#ifdef OOMPH_TRANSITION_TO_VERSION_3
1230
1231 METIS_PartGraphKway(&nvertex,
1232 xadj,
1233 &adjacency_vector[0],
1234 vwgt,
1235 adjwgt,
1236 &wgtflag,
1237 &numflag,
1238 &nparts,
1239 options,
1240 edgecut,
1241 part);
1242#else
1243 // for old version of METIS; these two functions have been merged
1244 // in the new METIS API
1245
1246 if (objective == 0)
1247 {
1248 // Partition with the objective of minimising the edge cut
1249 METIS_PartGraphKway(&nvertex,
1250 xadj,
1251 &adjacency_vector[0],
1252 vwgt,
1253 adjwgt,
1254 &wgtflag,
1255 &numflag,
1256 &nparts,
1257 options,
1258 edgecut,
1259 part);
1260 }
1261 else if (objective == 1)
1262 {
1263 // Partition with the objective of minimising the total communication
1264 // volume
1265 METIS_PartGraphVKway(&nvertex,
1266 xadj,
1267 &adjacency_vector[0],
1268 vwgt,
1269 adjwgt,
1270 &wgtflag,
1271 &numflag,
1272 &nparts,
1273 options,
1274 edgecut,
1275 part);
1276 }
1277#endif
1278 }
1279
1280 // Copy across
1281 Vector<unsigned> total_weight_on_proc(n_proc, 0);
1282 for (unsigned e = 0; e < total_number_of_root_elements; e++)
1283 {
1284 root_element_domain[e] = part[e];
1285 total_weight_on_proc[part[e]] += vwgt[e];
1286 }
1287
1288 // Document success of partitioning
1289 for (unsigned j = 0; j < n_proc; j++)
1290 {
1291 oomph_info << "Total weight on proc " << j << " is "
1292 << total_weight_on_proc[j] << std::endl;
1293 }
1294
1295 // Doc
1296 double cpu1 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
1297 oomph_info << "CPU time for METIS mesh partitioning [nroot_elem="
1298 << total_number_of_root_elements << "]: " << cpu1 << " sec"
1299 << std::endl;
1300
1301 // Cleanup
1302 delete[] xadj;
1303 delete[] part;
1304 delete[] vwgt;
1305 delete[] edgecut;
1306 delete[] options;
1307 }
1308
1309 // Now scatter things back to processors: root_element_domain[] contains
1310 // the target domain for all elements (concatenated in processor-by
1311 // processor order on the root processor). Distribute this back
1312 // to the processors so that root_element_domain_on_this_proc[e] contains
1313 // the target domain for root element e (in whatever order the processor
1314 // decided to line up its root elements).
1315 cpu_start = clock();
1316 Vector<unsigned> root_element_domain_on_this_proc(number_of_root_elements);
1317
1318 // Create at least one entry so we don't get a seg fault below
1319 if (root_element_domain_on_this_proc.size() == 0)
1320 {
1321 root_element_domain_on_this_proc.resize(1);
1322 }
1323 MPI_Scatterv(&root_element_domain[0],
1324 &number_of_root_elements_on_each_proc[0],
1325 &start_index[0],
1326 MPI_UNSIGNED,
1327 &root_element_domain_on_this_proc[0],
1328 number_of_root_elements,
1329 MPI_UNSIGNED,
1330 root_processor,
1331 comm_pt->mpi_comm());
1332
1333
1334 // Now translate back into target domain for the actual (non-root)
1335 // elements
1336 element_domain_on_this_proc.resize(number_of_non_halo_elements);
1337 unsigned count_non_halo = 0;
1338 for (unsigned e = 0; e < n_elem; e++)
1339 {
1340 GeneralisedElement* el_pt = mesh_pt->element_pt(e);
1341 if (!el_pt->is_halo())
1342 {
1343 // Get the associated root element which is either...
1344 GeneralisedElement* root_el_pt = 0;
1345 RefineableElement* ref_el_pt = dynamic_cast<RefineableElement*>(el_pt);
1346 if (ref_el_pt != 0)
1347 {
1348 //...the actual root element
1349 root_el_pt = ref_el_pt->root_element_pt();
1350 }
1351 // ...or the element itself
1352 else
1353 {
1354 root_el_pt = el_pt;
1355 }
1356
1357 // Recover the root element number (offset by one)
1358 unsigned root_el_number = root_el_number_plus_one[root_el_pt] - 1;
1359
1360 // Copy target domain across from root element
1361 element_domain_on_this_proc[count_non_halo] =
1362 root_element_domain_on_this_proc[root_el_number];
1363
1364 // Bump up counter for non-halo elements
1365 count_non_halo++;
1366 }
1367 }
1368
1369
1370#ifdef PARANOID
1371 if (count_non_halo != number_of_non_halo_elements)
1372 {
1373 std::ostringstream error_stream;
1374 error_stream << "Non-halo counts don't match: " << count_non_halo << " "
1375 << number_of_non_halo_elements << std::endl;
1376
1377 throw OomphLibError(
1378 error_stream.str(), OOMPH_CURRENT_FUNCTION, OOMPH_EXCEPTION_LOCATION);
1379 }
1380#endif
1381
1382 // End timer
1383 cpu_end = clock();
1384
1385 // Doc
1386 double cpu2 = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;
1387 oomph_info << "CPU time for communication of partition to all processors "
1388 "[nroot_elem="
1389 << total_number_of_root_elements << "]: " << cpu2 << " sec"
1390 << std::endl;
1391 }
1392
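// ------------------------------------------------------------------
// Illustrative usage sketch (not part of partitioning.cc): calling the
// distributed partitioner defined above, e.g. as part of a load-balancing
// step. The helper name is an assumption; element_domain_on_this_proc is
// resized internally, and entry e refers to the e-th non-halo element on
// this processor (in mesh order, halo elements skipped).
// ------------------------------------------------------------------
void doc_distributed_partition(Problem* problem_pt)
{
  // Partitioning objective: 0 = minimise edge cut
  unsigned objective = 0;

  // Target domain for each non-halo element on this processor
  Vector<unsigned> element_domain_on_this_proc;

  // Repartition; root elements are weighted by assembly time if
  // available, otherwise by the number of leaf elements they contain
  METIS::partition_distributed_mesh(
    problem_pt, objective, element_domain_on_this_proc);
}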
1393
1394#endif
1395
1396} // namespace oomph