OOFEM 3.0
Loading...
Searching...
No Matches
wallclockloadbalancermonitor.C
Go to the documentation of this file.
1/*
2 *
3 * ##### ##### ###### ###### ### ###
4 * ## ## ## ## ## ## ## ### ##
5 * ## ## ## ## #### #### ## # ##
6 * ## ## ## ## ## ## ## ##
7 * ## ## ## ## ## ## ## ##
8 * ##### ##### ## ###### ## ##
9 *
10 *
11 * OOFEM : Object Oriented Finite Element Code
12 *
13 * Copyright (C) 1993 - 2025 Borek Patzak
14 *
15 *
16 *
17 * Czech Technical University, Faculty of Civil Engineering,
18 * Department of Structural Mechanics, 166 29 Prague, Czech Republic
19 *
20 * This library is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU Lesser General Public
22 * License as published by the Free Software Foundation; either
23 * version 2.1 of the License, or (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28 * Lesser General Public License for more details.
29 *
30 * You should have received a copy of the GNU Lesser General Public
31 * License along with this library; if not, write to the Free Software
32 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33 */
34
36#include "engngm.h"
37#include "domain.h"
38#include "timestep.h"
39#include "element.h"
40#include "mathfem.h"
41#include "classfactory.h"
42
43#include <mpi.h>
44
45namespace oofem {
46
48
49LoadBalancerMonitor :: LoadBalancerDecisionType
50WallClockLoadBalancerMonitor :: decide(TimeStep *tStep)
51{
52 int nproc = emodel->giveNumberOfProcesses();
53 int myrank = emodel->giveRank();
54 Domain *d = emodel->giveLoadBalancer()->giveDomain();
55 int nelem;
56 double *node_solutiontimes = new double [ nproc ];
57 double *node_relcomppowers = new double [ nproc ];
58 double *node_equivelements = new double [ nproc ];
59 double min_st, max_st;
60 double relWallClockImbalance;
61 double absWallClockImbalance;
62 double neqelems, sum_relcomppowers;
63
64 if ( node_solutiontimes == NULL ) {
65 OOFEM_ERROR("failed to allocate node_solutiontimes array");
66 }
67
68 if ( node_relcomppowers == NULL ) {
69 OOFEM_ERROR("failed to allocate node_relcomppowers array");
70 }
71
72 if ( node_equivelements == NULL ) {
73 OOFEM_ERROR("failed to allocate node_equivelements array");
74 }
75
76
77 // compute wall solution time of my node
78 double mySolutionTime = emodel->giveTimer()->getWtime(EngngModelTimer :: EMTT_NetComputationalStepTimer);
79
80#ifdef __LB_DEBUG
81 // perturb solution time artificially if requested
82 bool perturb = false;
83 for ( auto perturbedStep: perturbedSteps ) {
84 if ( perturbedStep.test( tStep->giveNumber() ) ) {
85 perturb = true;
86 break;
87 }
88 }
89
90 if ( perturb ) {
91 mySolutionTime *= perturbFactor;
92 OOFEM_LOG_RELEVANT("[%d] WallClockLoadBalancerMonitor: perturbed solution time by factor=%.2f\n", myrank, perturbFactor);
93 }
94
95#endif
96
97 // collect wall clock computational time
98 MPI_Allgather(& mySolutionTime, 1, MPI_DOUBLE, node_solutiontimes, 1, MPI_DOUBLE, MPI_COMM_WORLD);
99
100 OOFEM_LOG_RELEVANT("\nLoadBalancer:: individual processor times [sec]: (");
101 for ( int i = 0; i < nproc; i++ ) {
102 OOFEM_LOG_RELEVANT(" %.3f", node_solutiontimes [ i ]);
103 }
104
105 OOFEM_LOG_RELEVANT(")\n");
106
107 // detect imbalance
108 min_st = max_st = node_solutiontimes [ 0 ];
109 for ( int i = 0; i < nproc; i++ ) {
110 min_st = min(min_st, node_solutiontimes [ i ]);
111 max_st = max(max_st, node_solutiontimes [ i ]);
112 }
113
114 absWallClockImbalance = ( max_st - min_st );
115 if ( min_st ) {
116 relWallClockImbalance = ( ( max_st - min_st ) / min_st );
117 } else {
118 relWallClockImbalance = 0.0;
119 }
120
121 // update node (processor) weights
122
123 // compute number or equivalent elements (equavalent element has computational weight equal to 1.0)
124 nelem = d->giveNumberOfElements();
125 neqelems = 0.0;
126 for ( int ie = 1; ie <= nelem; ie++ ) {
127 if ( d->giveElement(ie)->giveParallelMode() == Element_remote ) {
128 continue;
129 }
130
131 neqelems += d->giveElement(ie)->predictRelativeComputationalCost();
132 }
133
134 // exchange number or equivalent elements
135 MPI_Allgather(& neqelems, 1, MPI_DOUBLE, node_equivelements, 1, MPI_DOUBLE, MPI_COMM_WORLD);
136
137
138 if ( !this->staticNodeWeightFlag ) {
139 // compute relative computational powers (solution_time/number_of_equivalent_elements)
140 for ( int i = 0; i < nproc; i++ ) {
141 node_relcomppowers [ i ] = node_equivelements [ i ] / node_solutiontimes [ i ];
142 }
143
144 // normalize computational powers
145 sum_relcomppowers = 0.0;
146 for ( int i = 0; i < nproc; i++ ) {
147 sum_relcomppowers += node_relcomppowers [ i ];
148 }
149
150 for ( int i = 0; i < nproc; i++ ) {
151 nodeWeights(i) = node_relcomppowers [ i ] / sum_relcomppowers;
152 }
153 }
154
155 // log equivalent elements on nodes
156 OOFEM_LOG_RELEVANT("[%d] LoadBalancer: node equivalent elements: ", myrank);
157 for ( int i = 0; i < nproc; i++ ) {
158 OOFEM_LOG_RELEVANT("%6d ", ( int ) node_equivelements [ i ]);
159 }
160
161 OOFEM_LOG_RELEVANT("\n");
162
163 // log processor weights
164 OOFEM_LOG_RELEVANT("[%d] LoadBalancer: updated proc weights: ", myrank);
165 for ( int i = 0; i < nproc; i++ ) {
166#ifdef __LB_DEBUG
167 OOFEM_LOG_RELEVANT( "%22.15e ", nodeWeights(i) );
168#else
169 OOFEM_LOG_RELEVANT( "%4.3f ", nodeWeights(i) );
170#endif
171 }
172
173 OOFEM_LOG_RELEVANT("\n");
174
175 delete[] node_solutiontimes;
176 delete[] node_relcomppowers;
177 delete[] node_equivelements;
178
179#ifdef __LB_DEBUG
180 if ( recoveredSteps.giveSize() ) {
181 // recover lb if requested
182 int pos;
183 if ( ( pos = recoveredSteps.findFirstIndexOf( tStep->giveNumber() ) ) ) {
184 double procWeight, sumWeight = 0.0, *procWeights = new double [ nproc ];
185
186 // assign prescribed processing weight
187 procWeight = processingWeights.at(pos);
188 OOFEM_LOG_RELEVANT("[%d] WallClockLoadBalancerMonitor: processing weight overriden by value=%e\n", myrank, procWeight);
189
190 // exchange processing weights
191 MPI_Allgather(& procWeight, 1, MPI_DOUBLE, procWeights, 1, MPI_DOUBLE, MPI_COMM_WORLD);
192 for ( int i = 0; i < nproc; i++ ) {
193 nodeWeights(i) = procWeights [ i ];
194 sumWeight += procWeights [ i ];
195 }
196
197 delete[] procWeights;
198
199 if ( fabs(sumWeight - 1.0) > 1.0e-10 ) {
200 OOFEM_ERROR("[%d] processing weights do not sum to 1.0 (sum = %e)\n", sumWeight);
201 }
202
203 OOFEM_LOG_RELEVANT("[%d] LoadBalancer: wall clock imbalance rel=%.2f\%,abs=%.2fs, recovering load\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
204 return LBD_RECOVER;
205 } else {
206 OOFEM_LOG_RELEVANT("[%d] LoadBalancer: wall clock imbalance rel=%.2f\%,abs=%.2fs, continuing\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
207 return LBD_CONTINUE;
208 }
209 }
210
211#endif
212
213 // decide
214 if ( ( tStep->giveNumber() % this->lbstep == 0 ) &&
215 ( ( absWallClockImbalance > this->absWallClockImbalanceTreshold ) ||
216 ( ( relWallClockImbalance > this->relWallClockImbalanceTreshold ) && ( absWallClockImbalance > this->minAbsWallClockImbalanceTreshold ) ) ) ) {
217 OOFEM_LOG_RELEVANT("[%d] LoadBalancer: wall clock imbalance rel=%.2f\%,abs=%.2fs, recovering load\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
218 return LBD_RECOVER;
219 } else {
220 OOFEM_LOG_RELEVANT("[%d] LoadBalancer: wall clock imbalance rel=%.2f\%,abs=%.2fs, continuing\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
221 return LBD_CONTINUE;
222 }
223}
224
225
226void
252
253}
#define REGISTER_LoadBalancerMonitor(class)
int giveNumberOfElements() const
Returns number of elements in domain.
Definition domain.h:463
Element * giveElement(int n)
Definition domain.C:165
virtual double predictRelativeComputationalCost()
Definition element.C:1763
elementParallelMode giveParallelMode() const
Definition element.h:1139
int giveNumber()
Returns receiver's number.
Definition timestep.h:144
double relWallClockImbalanceTreshold
Declares min abs imbalance to perform relative imbalance check.
std ::list< Range > perturbedSteps
List of steps with perturbed balancing.
int lbstep
The rebalancing done every lbstep.
IntArray recoveredSteps
List of steps at which to perform lb recovery.
FloatArray processingWeights
processing weights for lb recovery.
#define OOFEM_ERROR(...)
Definition error.h:79
#define IR_GIVE_OPTIONAL_FIELD(__ir, __value, __id)
Definition inputrecord.h:75
#define OOFEM_LOG_RELEVANT(...)
Definition logger.h:142
FloatArrayF< N > min(const FloatArrayF< N > &a, const FloatArrayF< N > &b)
@ Element_remote
Element in active domain is only mirror of some remote element.
Definition element.h:89
FloatArrayF< N > max(const FloatArrayF< N > &a, const FloatArrayF< N > &b)
#define _IFT_WallClockLoadBalancerMonitor_minwct
#define _IFT_WallClockLoadBalancerMonitor_perturbfactor
#define _IFT_WallClockLoadBalancerMonitor_perturbedsteps
#define _IFT_WallClockLoadBalancerMonitor_abswct
#define _IFT_WallClockLoadBalancerMonitor_lbstep
#define _IFT_WallClockLoadBalancerMonitor_recoveredsteps
#define _IFT_WallClockLoadBalancerMonitor_processingweights
#define _IFT_WallClockLoadBalancerMonitor_relwct

This page is part of the OOFEM-3.0 documentation. Copyright (C) 1994-2025 Bořek Patzák
Project e-mail: oofem@fsv.cvut.cz
Generated at for OOFEM by doxygen 1.15.0 written by Dimitri van Heesch, © 1997-2011