50WallClockLoadBalancerMonitor :: decide(
TimeStep *tStep)
52 int nproc =
emodel->giveNumberOfProcesses();
53 int myrank =
emodel->giveRank();
56 double *node_solutiontimes =
new double [ nproc ];
57 double *node_relcomppowers =
new double [ nproc ];
58 double *node_equivelements =
new double [ nproc ];
59 double min_st, max_st;
60 double relWallClockImbalance;
61 double absWallClockImbalance;
62 double neqelems, sum_relcomppowers;
64 if ( node_solutiontimes == NULL ) {
65 OOFEM_ERROR(
"failed to allocate node_solutiontimes array");
68 if ( node_relcomppowers == NULL ) {
69 OOFEM_ERROR(
"failed to allocate node_relcomppowers array");
72 if ( node_equivelements == NULL ) {
73 OOFEM_ERROR(
"failed to allocate node_equivelements array");
78 double mySolutionTime =
emodel->giveTimer()->getWtime(EngngModelTimer :: EMTT_NetComputationalStepTimer);
84 if ( perturbedStep.test( tStep->
giveNumber() ) ) {
98 MPI_Allgather(& mySolutionTime, 1, MPI_DOUBLE, node_solutiontimes, 1, MPI_DOUBLE, MPI_COMM_WORLD);
101 for (
int i = 0; i < nproc; i++ ) {
108 min_st = max_st = node_solutiontimes [ 0 ];
109 for (
int i = 0; i < nproc; i++ ) {
110 min_st =
min(min_st, node_solutiontimes [ i ]);
111 max_st =
max(max_st, node_solutiontimes [ i ]);
114 absWallClockImbalance = ( max_st - min_st );
116 relWallClockImbalance = ( ( max_st - min_st ) / min_st );
118 relWallClockImbalance = 0.0;
126 for (
int ie = 1; ie <= nelem; ie++ ) {
135 MPI_Allgather(& neqelems, 1, MPI_DOUBLE, node_equivelements, 1, MPI_DOUBLE, MPI_COMM_WORLD);
140 for (
int i = 0; i < nproc; i++ ) {
141 node_relcomppowers [ i ] = node_equivelements [ i ] / node_solutiontimes [ i ];
145 sum_relcomppowers = 0.0;
146 for (
int i = 0; i < nproc; i++ ) {
147 sum_relcomppowers += node_relcomppowers [ i ];
150 for (
int i = 0; i < nproc; i++ ) {
151 nodeWeights(i) = node_relcomppowers [ i ] / sum_relcomppowers;
157 for (
int i = 0; i < nproc; i++ ) {
165 for (
int i = 0; i < nproc; i++ ) {
175 delete[] node_solutiontimes;
176 delete[] node_relcomppowers;
177 delete[] node_equivelements;
184 double procWeight, sumWeight = 0.0, *procWeights =
new double [ nproc ];
188 OOFEM_LOG_RELEVANT(
"[%d] WallClockLoadBalancerMonitor: processing weight overriden by value=%e\n", myrank, procWeight);
191 MPI_Allgather(& procWeight, 1, MPI_DOUBLE, procWeights, 1, MPI_DOUBLE, MPI_COMM_WORLD);
192 for (
int i = 0; i < nproc; i++ ) {
194 sumWeight += procWeights [ i ];
197 delete[] procWeights;
199 if ( fabs(sumWeight - 1.0) > 1.0e-10 ) {
// The format has two conversions ("[%d]" and "%e"), so both the rank and the sum must be supplied;
// passing only sumWeight fed a double to "%d" and left "%e" without an argument.
200 OOFEM_ERROR(
"[%d] processing weights do not sum to 1.0 (sum = %e)\n", myrank, sumWeight);
// "%%" prints a literal percent sign; "\%" is not a valid escape sequence and left a malformed "%," conversion.
203 OOFEM_LOG_RELEVANT(
"[%d] LoadBalancer: wall clock imbalance rel=%.2f%%,abs=%.2fs, recovering load\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
// "%%" prints a literal percent sign; "\%" is not a valid escape sequence and left a malformed "%," conversion.
206 OOFEM_LOG_RELEVANT(
"[%d] LoadBalancer: wall clock imbalance rel=%.2f%%,abs=%.2fs, continuing\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
214 if ( ( tStep->
giveNumber() % this->lbstep == 0 ) &&
215 ( ( absWallClockImbalance > this->absWallClockImbalanceTreshold ) ||
216 ( ( relWallClockImbalance > this->relWallClockImbalanceTreshold ) && ( absWallClockImbalance > this->minAbsWallClockImbalanceTreshold ) ) ) ) {
// "%%" prints a literal percent sign; "\%" is not a valid escape sequence and left a malformed "%," conversion.
217 OOFEM_LOG_RELEVANT(
"[%d] LoadBalancer: wall clock imbalance rel=%.2f%%,abs=%.2fs, recovering load\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
// "%%" prints a literal percent sign; "\%" is not a valid escape sequence and left a malformed "%," conversion.
220 OOFEM_LOG_RELEVANT(
"[%d] LoadBalancer: wall clock imbalance rel=%.2f%%,abs=%.2fs, continuing\n", myrank, 100 * relWallClockImbalance, absWallClockImbalance);
int giveNumberOfElements() const
Returns the number of elements in the domain.
Element * giveElement(int n)