Actual source code: pipecg2.c

petsc-3.15.0 2021-04-05
Report Typos and Errors
  1: #include <petsc/private/kspimpl.h>

  3: /* The auxiliary functions below are merged vector operations that load vectors from memory once and use
  4:    the data multiple times by performing vector operations element-wise. These functions
  5:    only apply to sequential vectors.
  6: */
  7: /*   VecMergedDot_Private function merges the dot products for gamma, delta and dp */
  8: static PetscErrorCode VecMergedDot_Private(Vec U,Vec W,Vec R,PetscInt normtype,PetscScalar *ru, PetscScalar *wu, PetscScalar *uu)
  9: {
 10:   const PetscScalar *PETSC_RESTRICT PU, *PETSC_RESTRICT PW, *PETSC_RESTRICT PR;
 11:   PetscScalar       sumru = 0.0, sumwu = 0.0, sumuu = 0.0;
 12:   PetscInt          j, n;
 13:   PetscErrorCode    ierr;

 16:   VecGetArrayRead(U,(const PetscScalar**)&PU);
 17:   VecGetArrayRead(W,(const PetscScalar**)&PW);
 18:   VecGetArrayRead(R,(const PetscScalar**)&PR);
 19:   VecGetLocalSize(U,&n);

 21:   if (normtype==KSP_NORM_PRECONDITIONED) {
 22:     PetscPragmaSIMD
 23:     for (j=0; j<n; j++) {
 24:       sumwu += PW[j] * PetscConj(PU[j]);
 25:       sumru += PR[j] * PetscConj(PU[j]);
 26:       sumuu += PU[j] * PetscConj(PU[j]);
 27:     }
 28:   } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
 29:     PetscPragmaSIMD
 30:     for (j=0; j<n; j++) {
 31:       sumwu += PW[j] * PetscConj(PU[j]);
 32:       sumru += PR[j] * PetscConj(PU[j]);
 33:       sumuu += PR[j] * PetscConj(PR[j]);
 34:     }
 35:   } else if (normtype==KSP_NORM_NATURAL) {
 36:     PetscPragmaSIMD
 37:     for (j=0; j<n; j++) {
 38:       sumwu += PW[j] * PetscConj(PU[j]);
 39:       sumru += PR[j] * PetscConj(PU[j]);
 40:     }
 41:     sumuu = sumru;
 42:   }

 44:   *ru = sumru;
 45:   *wu = sumwu;
 46:   *uu = sumuu;

 48:   VecRestoreArrayRead(U,(const PetscScalar**)&PU);
 49:   VecRestoreArrayRead(W,(const PetscScalar**)&PW);
 50:   VecRestoreArrayRead(R,(const PetscScalar**)&PR);
 51:   return(0);
 52: }

 54: /*   VecMergedDot2_Private function merges the dot products for lambda_1 and lambda_4 */
 55: static PetscErrorCode VecMergedDot2_Private(Vec N,Vec M,Vec W,PetscScalar *wm,PetscScalar *nm)
 56: {
 57:   const PetscScalar *PETSC_RESTRICT PN, *PETSC_RESTRICT PM, *PETSC_RESTRICT PW;
 58:   PetscScalar       sumwm = 0.0, sumnm = 0.0;
 59:   PetscInt          j, n;
 60:   PetscErrorCode    ierr;

 63:   VecGetArrayRead(W,(const PetscScalar**)&PW);
 64:   VecGetArrayRead(N,(const PetscScalar**)&PN);
 65:   VecGetArrayRead(M,(const PetscScalar**)&PM);
 66:   VecGetLocalSize(N,&n);

 68:   PetscPragmaSIMD
 69:   for (j=0; j<n; j++){
 70:     sumwm += PW[j] * PetscConj(PM[j]);
 71:     sumnm += PN[j] * PetscConj(PM[j]);
 72:   }

 74:   *wm = sumwm;
 75:   *nm = sumnm;

 77:   VecRestoreArrayRead(W,(const PetscScalar**)&PW);
 78:   VecRestoreArrayRead(N,(const PetscScalar**)&PN);
 79:   VecRestoreArrayRead(M,(const PetscScalar**)&PM);
 80:   return(0);
 81: }

 83: /*   VecMergedOpsShort_Private function merges the dot products, AXPY and SAXPY operations for all vectors for iteration 0  */
 84: static PetscErrorCode VecMergedOpsShort_Private(Vec vx,Vec vr,Vec vz,Vec vw,Vec vp,Vec vq,Vec vc,Vec vd,Vec vg0,Vec vh0,Vec vg1,Vec vh1,Vec vs,Vec va1,Vec vb1,Vec ve,Vec vf,Vec vm,Vec vn,Vec vu,PetscInt normtype,PetscScalar beta0,PetscScalar alpha0,PetscScalar beta1,PetscScalar alpha1,PetscScalar *lambda)
 85: {
 86:   PetscScalar       *PETSC_RESTRICT px, *PETSC_RESTRICT pr, *PETSC_RESTRICT pz, *PETSC_RESTRICT pw;
 87:   PetscScalar       *PETSC_RESTRICT pp, *PETSC_RESTRICT pq;
 88:   PetscScalar       *PETSC_RESTRICT pc, *PETSC_RESTRICT pd, *PETSC_RESTRICT pg0, *PETSC_RESTRICT ph0, *PETSC_RESTRICT pg1,*PETSC_RESTRICT ph1,*PETSC_RESTRICT ps,*PETSC_RESTRICT pa1,*PETSC_RESTRICT pb1, *PETSC_RESTRICT pe,*PETSC_RESTRICT pf,*PETSC_RESTRICT pm,*PETSC_RESTRICT pn, *PETSC_RESTRICT pu;
 89:   PetscInt          j, n;
 90:   PetscErrorCode    ierr;

 93:   VecGetArray(vx,(PetscScalar**)&px);
 94:   VecGetArray(vr,(PetscScalar**)&pr);
 95:   VecGetArray(vz,(PetscScalar**)&pz);
 96:   VecGetArray(vw,(PetscScalar**)&pw);
 97:   VecGetArray(vp,(PetscScalar**)&pp);
 98:   VecGetArray(vq,(PetscScalar**)&pq);
 99:   VecGetArray(vc,(PetscScalar**)&pc);
100:   VecGetArray(vd,(PetscScalar**)&pd);
101:   VecGetArray(vg0,(PetscScalar**)&pg0);
102:   VecGetArray(vh0,(PetscScalar**)&ph0);
103:   VecGetArray(vg1,(PetscScalar**)&pg1);
104:   VecGetArray(vh1,(PetscScalar**)&ph1);
105:   VecGetArray(vs,(PetscScalar**)&ps);
106:   VecGetArray(va1,(PetscScalar**)&pa1);
107:   VecGetArray(vb1,(PetscScalar**)&pb1);
108:   VecGetArray(ve,(PetscScalar**)&pe);
109:   VecGetArray(vf,(PetscScalar**)&pf);
110:   VecGetArray(vm,(PetscScalar**)&pm);
111:   VecGetArray(vn,(PetscScalar**)&pn);
112:   VecGetArray(vu,(PetscScalar**)&pu);

114:   VecGetLocalSize(vx,&n);
115:   for (j=0; j<15; j++) lambda[j] = 0.0;

117:   if (normtype==KSP_NORM_PRECONDITIONED) {
118:     PetscPragmaSIMD
119:     for (j=0; j<n; j++) {
120:       pz[j] = pn[j];
121:       pq[j] = pm[j];
122:       ps[j] = pw[j];
123:       pp[j] = pu[j];
124:       pc[j] = pg0[j];
125:       pd[j] = ph0[j];
126:       pa1[j] = pe[j];
127:       pb1[j] = pf[j];

129:       px[j] = px[j] + alpha0 * pp[j];
130:       pr[j] = pr[j] - alpha0 * ps[j];
131:       pu[j] = pu[j] - alpha0 * pq[j];
132:       pw[j] = pw[j] - alpha0 * pz[j];
133:       pm[j] = pm[j] - alpha0 * pc[j];
134:       pn[j] = pn[j] - alpha0 * pd[j];
135:       pg0[j] = pg0[j] - alpha0 * pa1[j];
136:       ph0[j] = ph0[j] - alpha0 * pb1[j];

138:       pg1[j] = pg0[j];
139:       ph1[j] = ph0[j];

141:       pz[j] = pn[j] + beta1 * pz[j];
142:       pq[j] = pm[j] + beta1 * pq[j];
143:       ps[j] = pw[j] + beta1 * ps[j];
144:       pp[j] = pu[j] + beta1 * pp[j];
145:       pc[j] = pg0[j] + beta1 * pc[j];
146:       pd[j] = ph0[j] + beta1 * pd[j];

148:       px[j] = px[j] + alpha1 * pp[j];
149:       pr[j] = pr[j] - alpha1 * ps[j];
150:       pu[j] = pu[j] - alpha1 * pq[j];
151:       pw[j] = pw[j] - alpha1 * pz[j];
152:       pm[j] = pm[j] - alpha1 * pc[j];
153:       pn[j] = pn[j] - alpha1 * pd[j];

155:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
156:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
157:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
158:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
159:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pu[j] * PetscConj(pu[j]);
160:     }
161:     lambda[3] = PetscConj(lambda[2]);
162:     lambda[5] = PetscConj(lambda[1]);
163:     lambda[8] = PetscConj(lambda[7]);
164:     lambda[13] = PetscConj(lambda[11]);
165:     lambda[14] = PetscConj(lambda[0]);

167:   } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
168:     PetscPragmaSIMD
169:     for (j=0; j<n; j++) {
170:       pz[j] = pn[j];
171:       pq[j] = pm[j];
172:       ps[j] = pw[j];
173:       pp[j] = pu[j];
174:       pc[j] = pg0[j];
175:       pd[j] = ph0[j];
176:       pa1[j] = pe[j];
177:       pb1[j] = pf[j];

179:       px[j] = px[j] + alpha0 * pp[j];
180:       pr[j] = pr[j] - alpha0 * ps[j];
181:       pu[j] = pu[j] - alpha0 * pq[j];
182:       pw[j] = pw[j] - alpha0 * pz[j];
183:       pm[j] = pm[j] - alpha0 * pc[j];
184:       pn[j] = pn[j] - alpha0 * pd[j];
185:       pg0[j] = pg0[j] - alpha0 * pa1[j];
186:       ph0[j] = ph0[j] - alpha0 * pb1[j];

188:       pg1[j] = pg0[j];
189:       ph1[j] = ph0[j];

191:       pz[j] = pn[j] + beta1 * pz[j];
192:       pq[j] = pm[j] + beta1 * pq[j];
193:       ps[j] = pw[j] + beta1 * ps[j];
194:       pp[j] = pu[j] + beta1 * pp[j];
195:       pc[j] = pg0[j] + beta1 * pc[j];
196:       pd[j] = ph0[j] + beta1 * pd[j];

198:       px[j] = px[j] + alpha1 * pp[j];
199:       pr[j] = pr[j] - alpha1 * ps[j];
200:       pu[j] = pu[j] - alpha1 * pq[j];
201:       pw[j] = pw[j] - alpha1 * pz[j];
202:       pm[j] = pm[j] - alpha1 * pc[j];
203:       pn[j] = pn[j] - alpha1 * pd[j];

205:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
206:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
207:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
208:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
209:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pr[j] * PetscConj(pr[j]);
210:     }
211:     lambda[3] = PetscConj(lambda[2]);
212:     lambda[5] = PetscConj(lambda[1]);
213:     lambda[8] = PetscConj(lambda[7]);
214:     lambda[13] = PetscConj(lambda[11]);
215:     lambda[14] = PetscConj(lambda[0]);

217:   } else if (normtype==KSP_NORM_NATURAL) {
218:     PetscPragmaSIMD
219:     for (j=0; j<n; j++) {
220:       pz[j] = pn[j];
221:       pq[j] = pm[j];
222:       ps[j] = pw[j];
223:       pp[j] = pu[j];
224:       pc[j] = pg0[j];
225:       pd[j] = ph0[j];
226:       pa1[j] = pe[j];
227:       pb1[j] = pf[j];

229:       px[j] = px[j] + alpha0 * pp[j];
230:       pr[j] = pr[j] - alpha0 * ps[j];
231:       pu[j] = pu[j] - alpha0 * pq[j];
232:       pw[j] = pw[j] - alpha0 * pz[j];
233:       pm[j] = pm[j] - alpha0 * pc[j];
234:       pn[j] = pn[j] - alpha0 * pd[j];
235:       pg0[j] = pg0[j] - alpha0 * pa1[j];
236:       ph0[j] = ph0[j] - alpha0 * pb1[j];

238:       pg1[j] = pg0[j];
239:       ph1[j] = ph0[j];

241:       pz[j] = pn[j] + beta1 * pz[j];
242:       pq[j] = pm[j] + beta1 * pq[j];
243:       ps[j] = pw[j] + beta1 * ps[j];
244:       pp[j] = pu[j] + beta1 * pp[j];
245:       pc[j] = pg0[j] + beta1 * pc[j];
246:       pd[j] = ph0[j] + beta1 * pd[j];

248:       px[j] = px[j] + alpha1 * pp[j];
249:       pr[j] = pr[j] - alpha1 * ps[j];
250:       pu[j] = pu[j] - alpha1 * pq[j];
251:       pw[j] = pw[j] - alpha1 * pz[j];
252:       pm[j] = pm[j] - alpha1 * pc[j];
253:       pn[j] = pn[j] - alpha1 * pd[j];

255:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
256:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
257:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
258:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
259:       lambda[11] += pw[j] * PetscConj(pu[j]);
260:     }
261:     lambda[3] = PetscConj(lambda[2]);
262:     lambda[5] = PetscConj(lambda[1]);
263:     lambda[8] = PetscConj(lambda[7]);
264:     lambda[13] = PetscConj(lambda[11]);
265:     lambda[14] = PetscConj(lambda[0]);
266:     lambda[12] = lambda[10];

268:   }

270:   VecRestoreArray(vx,(PetscScalar**)&px);
271:   VecRestoreArray(vr,(PetscScalar**)&pr);
272:   VecRestoreArray(vz,(PetscScalar**)&pz);
273:   VecRestoreArray(vw,(PetscScalar**)&pw);
274:   VecRestoreArray(vp,(PetscScalar**)&pp);
275:   VecRestoreArray(vq,(PetscScalar**)&pq);
276:   VecRestoreArray(vc,(PetscScalar**)&pc);
277:   VecRestoreArray(vd,(PetscScalar**)&pd);
278:   VecRestoreArray(vg0,(PetscScalar**)&pg0);
279:   VecRestoreArray(vh0,(PetscScalar**)&ph0);
280:   VecRestoreArray(vg1,(PetscScalar**)&pg1);
281:   VecRestoreArray(vh1,(PetscScalar**)&ph1);
282:   VecRestoreArray(vs,(PetscScalar**)&ps);
283:   VecRestoreArray(va1,(PetscScalar**)&pa1);
284:   VecRestoreArray(vb1,(PetscScalar**)&pb1);
285:   VecRestoreArray(ve,(PetscScalar**)&pe);
286:   VecRestoreArray(vf,(PetscScalar**)&pf);
287:   VecRestoreArray(vm,(PetscScalar**)&pm);
288:   VecRestoreArray(vn,(PetscScalar**)&pn);
289:   VecRestoreArray(vu,(PetscScalar**)&pu);
290:   return(0);
291: }

293: /*   VecMergedOps_Private function merges the dot products, AXPY and SAXPY operations for all vectors for iteration > 0  */
294: static PetscErrorCode VecMergedOps_Private(Vec vx,Vec vr,Vec vz,Vec vw,Vec vp,Vec vq,Vec vc,Vec vd,Vec vg0,Vec vh0,Vec vg1,Vec vh1,Vec vs,Vec va1,Vec vb1,Vec ve,Vec vf,Vec vm,Vec vn,Vec vu,PetscInt normtype,PetscScalar beta0,PetscScalar alpha0,PetscScalar beta1,PetscScalar alpha1,PetscScalar *lambda, PetscScalar alphaold)
295: {
296:   PetscScalar       *PETSC_RESTRICT px, *PETSC_RESTRICT pr, *PETSC_RESTRICT pz, *PETSC_RESTRICT pw;
297:   PetscScalar       *PETSC_RESTRICT pp, *PETSC_RESTRICT pq;
298:   PetscScalar       *PETSC_RESTRICT pc,  *PETSC_RESTRICT pd, *PETSC_RESTRICT pg0,  *PETSC_RESTRICT ph0,  *PETSC_RESTRICT pg1, *PETSC_RESTRICT ph1,*PETSC_RESTRICT ps, *PETSC_RESTRICT pa1,*PETSC_RESTRICT pb1,*PETSC_RESTRICT pe,*PETSC_RESTRICT pf, *PETSC_RESTRICT pm,*PETSC_RESTRICT pn, *PETSC_RESTRICT pu;
299:   PetscInt          j, n;
300:   PetscErrorCode    ierr;

303:   VecGetArray(vx,(PetscScalar**)&px);
304:   VecGetArray(vr,(PetscScalar**)&pr);
305:   VecGetArray(vz,(PetscScalar**)&pz);
306:   VecGetArray(vw,(PetscScalar**)&pw);
307:   VecGetArray(vp,(PetscScalar**)&pp);
308:   VecGetArray(vq,(PetscScalar**)&pq);
309:   VecGetArray(vc,(PetscScalar**)&pc);
310:   VecGetArray(vd,(PetscScalar**)&pd);
311:   VecGetArray(vg0,(PetscScalar**)&pg0);
312:   VecGetArray(vh0,(PetscScalar**)&ph0);
313:   VecGetArray(vg1,(PetscScalar**)&pg1);
314:   VecGetArray(vh1,(PetscScalar**)&ph1);
315:   VecGetArray(vs,(PetscScalar**)&ps);
316:   VecGetArray(va1,(PetscScalar**)&pa1);
317:   VecGetArray(vb1,(PetscScalar**)&pb1);
318:   VecGetArray(ve,(PetscScalar**)&pe);
319:   VecGetArray(vf,(PetscScalar**)&pf);
320:   VecGetArray(vm,(PetscScalar**)&pm);
321:   VecGetArray(vn,(PetscScalar**)&pn);
322:   VecGetArray(vu,(PetscScalar**)&pu);

324:   VecGetLocalSize(vx,&n);
325:   for (j=0; j<15; j++) lambda[j] = 0.0;

327:   if (normtype==KSP_NORM_PRECONDITIONED) {
328:     PetscPragmaSIMD
329:     for (j=0; j<n; j++) {
330:       pa1[j] = (pg1[j] - pg0[j])/alphaold;
331:       pb1[j] = (ph1[j] - ph0[j])/alphaold;

333:       pz[j] = pn[j] + beta0 * pz[j];
334:       pq[j] = pm[j] + beta0 * pq[j];
335:       ps[j] = pw[j] + beta0 * ps[j];
336:       pp[j] = pu[j] + beta0 * pp[j];
337:       pc[j] = pg0[j] + beta0 * pc[j];
338:       pd[j] = ph0[j] + beta0 * pd[j];
339:       pa1[j] = pe[j] + beta0 * pa1[j];
340:       pb1[j] = pf[j] + beta0 * pb1[j];

342:       px[j] = px[j] + alpha0 * pp[j];
343:       pr[j] = pr[j] - alpha0 * ps[j];
344:       pu[j] = pu[j] - alpha0 * pq[j];
345:       pw[j] = pw[j] - alpha0 * pz[j];
346:       pm[j] = pm[j] - alpha0 * pc[j];
347:       pn[j] = pn[j] - alpha0 * pd[j];
348:       pg0[j] = pg0[j] - alpha0 * pa1[j];
349:       ph0[j] = ph0[j] - alpha0 * pb1[j];

351:       pg1[j] = pg0[j];
352:       ph1[j] = ph0[j];

354:       pz[j] = pn[j] + beta1 * pz[j];
355:       pq[j] = pm[j] + beta1 * pq[j];
356:       ps[j] = pw[j] + beta1 * ps[j];
357:       pp[j] = pu[j] + beta1 * pp[j];
358:       pc[j] = pg0[j] + beta1 * pc[j];
359:       pd[j] = ph0[j] + beta1 * pd[j];

361:       px[j] = px[j] + alpha1 * pp[j];
362:       pr[j] = pr[j] - alpha1 * ps[j];
363:       pu[j] = pu[j] - alpha1 * pq[j];
364:       pw[j] = pw[j] - alpha1 * pz[j];
365:       pm[j] = pm[j] - alpha1 * pc[j];
366:       pn[j] = pn[j] - alpha1 * pd[j];

368:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
369:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
370:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
371:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
372:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pu[j] * PetscConj(pu[j]);
373:     }
374:     lambda[3] = PetscConj(lambda[2]);
375:     lambda[5] = PetscConj(lambda[1]);
376:     lambda[8] = PetscConj(lambda[7]);
377:     lambda[13] = PetscConj(lambda[11]);
378:     lambda[14] = PetscConj(lambda[0]);
379:   } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
380:     PetscPragmaSIMD
381:     for (j=0; j<n; j++) {
382:       pa1[j] = (pg1[j] - pg0[j])/alphaold;
383:       pb1[j] = (ph1[j] - ph0[j])/alphaold;

385:       pz[j] = pn[j] + beta0 * pz[j];
386:       pq[j] = pm[j] + beta0 * pq[j];
387:       ps[j] = pw[j] + beta0 * ps[j];
388:       pp[j] = pu[j] + beta0 * pp[j];
389:       pc[j] = pg0[j] + beta0 * pc[j];
390:       pd[j] = ph0[j] + beta0 * pd[j];
391:       pa1[j] = pe[j] + beta0 * pa1[j];
392:       pb1[j] = pf[j] + beta0 * pb1[j];

394:       px[j] = px[j] + alpha0 * pp[j];
395:       pr[j] = pr[j] - alpha0 * ps[j];
396:       pu[j] = pu[j] - alpha0 * pq[j];
397:       pw[j] = pw[j] - alpha0 * pz[j];
398:       pm[j] = pm[j] - alpha0 * pc[j];
399:       pn[j] = pn[j] - alpha0 * pd[j];
400:       pg0[j] = pg0[j] - alpha0 * pa1[j];
401:       ph0[j] = ph0[j] - alpha0 * pb1[j];

403:       pg1[j] = pg0[j];
404:       ph1[j] = ph0[j];

406:       pz[j] = pn[j] + beta1 * pz[j];
407:       pq[j] = pm[j] + beta1 * pq[j];
408:       ps[j] = pw[j] + beta1 * ps[j];
409:       pp[j] = pu[j] + beta1 * pp[j];
410:       pc[j] = pg0[j] + beta1 * pc[j];
411:       pd[j] = ph0[j] + beta1 * pd[j];

413:       px[j] = px[j] + alpha1 * pp[j];
414:       pr[j] = pr[j] - alpha1 * ps[j];
415:       pu[j] = pu[j] - alpha1 * pq[j];
416:       pw[j] = pw[j] - alpha1 * pz[j];
417:       pm[j] = pm[j] - alpha1 * pc[j];
418:       pn[j] = pn[j] - alpha1 * pd[j];

420:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
421:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
422:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
423:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
424:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pr[j] * PetscConj(pr[j]);
425:     }
426:     lambda[3] = PetscConj(lambda[2]);
427:     lambda[5] = PetscConj(lambda[1]);
428:     lambda[8] = PetscConj(lambda[7]);
429:     lambda[13] = PetscConj(lambda[11]);
430:     lambda[14] = PetscConj(lambda[0]);
431:   } else if (normtype==KSP_NORM_NATURAL) {
432:     PetscPragmaSIMD
433:     for (j=0; j<n; j++) {
434:       pa1[j] = (pg1[j] - pg0[j])/alphaold;
435:       pb1[j] = (ph1[j] - ph0[j])/alphaold;

437:       pz[j] = pn[j] + beta0 * pz[j];
438:       pq[j] = pm[j] + beta0 * pq[j];
439:       ps[j] = pw[j] + beta0 * ps[j];
440:       pp[j] = pu[j] + beta0 * pp[j];
441:       pc[j] = pg0[j] + beta0 * pc[j];
442:       pd[j] = ph0[j] + beta0 * pd[j];
443:       pa1[j] = pe[j] + beta0 * pa1[j];
444:       pb1[j] = pf[j] + beta0 * pb1[j];

446:       px[j] = px[j] + alpha0 * pp[j];
447:       pr[j] = pr[j] - alpha0 * ps[j];
448:       pu[j] = pu[j] - alpha0 * pq[j];
449:       pw[j] = pw[j] - alpha0 * pz[j];
450:       pm[j] = pm[j] - alpha0 * pc[j];
451:       pn[j] = pn[j] - alpha0 * pd[j];
452:       pg0[j] = pg0[j] - alpha0 * pa1[j];
453:       ph0[j] = ph0[j] - alpha0 * pb1[j];

455:       pg1[j] = pg0[j];
456:       ph1[j] = ph0[j];

458:       pz[j] = pn[j] + beta1 * pz[j];
459:       pq[j] = pm[j] + beta1 * pq[j];
460:       ps[j] = pw[j] + beta1 * ps[j];
461:       pp[j] = pu[j] + beta1 * pp[j];
462:       pc[j] = pg0[j] + beta1 * pc[j];
463:       pd[j] = ph0[j] + beta1 * pd[j];

465:       px[j] = px[j] + alpha1 * pp[j];
466:       pr[j] = pr[j] - alpha1 * ps[j];
467:       pu[j] = pu[j] - alpha1 * pq[j];
468:       pw[j] = pw[j] - alpha1 * pz[j];
469:       pm[j] = pm[j] - alpha1 * pc[j];
470:       pn[j] = pn[j] - alpha1 * pd[j];

472:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
473:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
474:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
475:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
476:       lambda[11] += pw[j] * PetscConj(pu[j]);
477:     }
478:     lambda[3] = PetscConj(lambda[2]);
479:     lambda[5] = PetscConj(lambda[1]);
480:     lambda[8] = PetscConj(lambda[7]);
481:     lambda[13] = PetscConj(lambda[11]);
482:     lambda[14] = PetscConj(lambda[0]);
483:     lambda[12] = lambda[10];
484:   }

486:   VecRestoreArray(vx,(PetscScalar**)&px);
487:   VecRestoreArray(vr,(PetscScalar**)&pr);
488:   VecRestoreArray(vz,(PetscScalar**)&pz);
489:   VecRestoreArray(vw,(PetscScalar**)&pw);
490:   VecRestoreArray(vp,(PetscScalar**)&pp);
491:   VecRestoreArray(vq,(PetscScalar**)&pq);
492:   VecRestoreArray(vc,(PetscScalar**)&pc);
493:   VecRestoreArray(vd,(PetscScalar**)&pd);
494:   VecRestoreArray(vg0,(PetscScalar**)&pg0);
495:   VecRestoreArray(vh0,(PetscScalar**)&ph0);
496:   VecRestoreArray(vg1,(PetscScalar**)&pg1);
497:   VecRestoreArray(vh1,(PetscScalar**)&ph1);
498:   VecRestoreArray(vs,(PetscScalar**)&ps);
499:   VecRestoreArray(va1,(PetscScalar**)&pa1);
500:   VecRestoreArray(vb1,(PetscScalar**)&pb1);
501:   VecRestoreArray(ve,(PetscScalar**)&pe);
502:   VecRestoreArray(vf,(PetscScalar**)&pf);
503:   VecRestoreArray(vm,(PetscScalar**)&pm);
504:   VecRestoreArray(vn,(PetscScalar**)&pn);
505:   VecRestoreArray(vu,(PetscScalar**)&pu);
506:   return(0);
507: }

509: /*
510:      KSPSetUp_PIPECG2 - Sets up the workspace needed by the PIPECG method.

512:       This is called once, usually automatically by KSPSolve() or KSPSetUp()
513:      but can be called directly by KSPSetUp()
514: */
515: static  PetscErrorCode KSPSetUp_PIPECG2(KSP ksp)
516: {

520:   /* get work vectors needed by PIPECG2 */
521:   KSPSetWorkVecs(ksp,20);
522:   return(0);
523: }

525: /*
526:  KSPSolve_PIPECG2 - This routine actually applies the PIPECG2 method
527: */
528: static PetscErrorCode  KSPSolve_PIPECG2(KSP ksp)
529: {
531:   PetscInt       i,n;
532:   PetscScalar    alpha[2],beta[2],gamma[2],delta[2],lambda[15];
533:   PetscScalar    dps = 0.0,alphaold=0.0;
534:   PetscReal      dp = 0.0;
535:   Vec            X,B,Z,P,W,Q,U,M,N,R,S,C,D,E,F,G[2],H[2],A1,B1;
536:   Mat            Amat,Pmat;
537:   PetscBool      diagonalscale;
538:   MPI_Comm       pcomm;
539:   MPI_Request    req;
540:   MPI_Status     stat;

543:   pcomm = PetscObjectComm((PetscObject)ksp);
544:   PCGetDiagonalScale(ksp->pc,&diagonalscale);
545:   if (diagonalscale) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"Krylov method %s does not support diagonal scaling",((PetscObject)ksp)->type_name);

547:   X = ksp->vec_sol;
548:   B = ksp->vec_rhs;
549:   M = ksp->work[0];
550:   Z = ksp->work[1];
551:   P = ksp->work[2];
552:   N = ksp->work[3];
553:   W = ksp->work[4];
554:   Q = ksp->work[5];
555:   U = ksp->work[6];
556:   R = ksp->work[7];
557:   S = ksp->work[8];
558:   C = ksp->work[9];
559:   D = ksp->work[10];
560:   E = ksp->work[11];
561:   F = ksp->work[12];
562:   G[0]  = ksp->work[13];
563:   H[0] = ksp->work[14];
564:   G[1]  = ksp->work[15];
565:   H[1] = ksp->work[16];
566:   A1 = ksp->work[17];
567:   B1 = ksp->work[18];

569:   PetscMemzero(alpha,2*sizeof(PetscScalar));
570:   PetscMemzero(beta,2*sizeof(PetscScalar));
571:   PetscMemzero(gamma,2*sizeof(PetscScalar));
572:   PetscMemzero(delta,2*sizeof(PetscScalar));
573:   PetscMemzero(lambda,15*sizeof(PetscScalar));

575:   VecGetLocalSize(B,&n);
576:   PCGetOperators(ksp->pc,&Amat,&Pmat);

578:   ksp->its = 0;
579:   if (!ksp->guess_zero) {
580:     KSP_MatMult(ksp,Amat,X,R);             /*  r <- b - Ax  */
581:     VecAYPX(R,-1.0,B);
582:   } else {
583:     VecCopy(B,R);                          /*  r <- b (x is 0) */
584:   }

586:   KSP_PCApply(ksp,R,U);                    /*  u <- Br  */
587:   KSP_MatMult(ksp,Amat,U,W);               /*  w <- Au  */

589:   VecMergedDot_Private(U,W,R,ksp->normtype,&gamma[0],&delta[0],&dps);     /*  gamma  <- r'*u , delta <- w'*u , dp <- u'*u or r'*r or r'*u depending on ksp_norm_type  */
590:   lambda[10]= gamma[0];
591:   lambda[11]= delta[0];
592:   lambda[12] = dps;

594:   MPI_Iallreduce(MPI_IN_PLACE,&lambda[10],3,MPIU_SCALAR,MPIU_SUM,pcomm,&req);

596:   KSP_PCApply(ksp,W,M);                    /*  m <- Bw  */
597:   KSP_MatMult(ksp,Amat,M,N);               /*  n <- Am  */

599:   KSP_PCApply(ksp,N,G[0]);                 /*  g <- Bn  */
600:   KSP_MatMult(ksp,Amat,G[0],H[0]);         /*  h <- Ag  */

602:   KSP_PCApply(ksp,H[0],E);                 /*  e <- Bh  */
603:   KSP_MatMult(ksp,Amat,E,F);               /*  f <- Ae  */

605:   MPI_Wait(&req,&stat);

607:   gamma[0] = lambda[10];
608:   delta[0] = lambda[11];
609:   dp = PetscSqrtReal(PetscAbsScalar(lambda[12]));

611:   VecMergedDot2_Private(N,M,W,&lambda[1],&lambda[6]);     /*  lambda_1 <- w'*m , lambda_4 <- n'*m  */
612:   MPIU_Allreduce(MPI_IN_PLACE,&lambda[1],1,MPIU_SCALAR,MPIU_SUM,pcomm);
613:   MPIU_Allreduce(MPI_IN_PLACE,&lambda[6],1,MPIU_SCALAR,MPIU_SUM,pcomm);

615:   lambda[5] = PetscConj(lambda[1]);
616:   lambda[13] = PetscConj(lambda[11]);

618:   KSPLogResidualHistory(ksp,dp);
619:   KSPMonitor(ksp,0,dp);
620:   ksp->rnorm = dp;

622:   (*ksp->converged)(ksp,0,dp,&ksp->reason,ksp->cnvP); /* test for convergence */
623:   if (ksp->reason) return(0);

625:   for (i=2; i<ksp->max_it; i+=2) {
626:     if (i == 2) {
627:       beta[0] = 0;
628:       alpha[0] = gamma[0]/delta[0];

630:       gamma[1] = gamma[0] - alpha[0] * lambda[13] - alpha[0] * delta[0] + alpha[0] * alpha[0] * lambda[1];
631:       delta[1] = delta[0] - alpha[0] * lambda[1] - alpha[0] * lambda[5] + alpha[0]*alpha[0]*lambda[6];

633:       beta[1]  = gamma[1] / gamma[0];
634:       alpha[1] = gamma[1] / (delta[1] - beta[1] / alpha[0] * gamma[1]);

636:       VecMergedOpsShort_Private(X,R,Z,W,P,Q,C,D,G[0],H[0],G[1],H[1],S,A1,B1,E,F,M,N,U,ksp->normtype,beta[0],alpha[0],beta[1],alpha[1],lambda);
637:     } else {
638:       beta[0]  = gamma[1] / gamma[0];
639:       alpha[0] = gamma[1] / (delta[1] - beta[0] / alpha[1] * gamma[1]);

641:       gamma[0] = gamma[1];
642:       delta[0] = delta[1];

644:       gamma[1] = gamma[0] - alpha[0]*(lambda[13] + beta[0]*lambda[14]) - alpha[0] *(delta[0] + beta[0] * lambda[0]) + alpha[0] * alpha[0] * (lambda[1] + beta[0] * lambda[2] + beta[0] * lambda[3] + beta[0] * beta[0] * lambda[4]);

646:       delta[1] = delta[0] - alpha[0] * (lambda[1] + beta[0]* lambda[2]) - alpha[0] *(lambda[5] + beta[0] * lambda[3]) + alpha[0]*alpha[0] * (lambda[6] + beta[0] * lambda[7] + beta[0] *lambda[8] + beta[0] * beta[0] * lambda[9]);

648:       beta[1]  = gamma[1] / gamma[0];
649:       alpha[1] = gamma[1] / (delta[1] - beta[1] / alpha[0] * gamma[1]);

651:        VecMergedOps_Private(X,R,Z,W,P,Q,C,D,G[0],H[0],G[1],H[1],S,A1,B1,E,F,M,N,U,ksp->normtype,beta[0],alpha[0],beta[1],alpha[1],lambda,alphaold);
652:     }

654:     gamma[0] = gamma[1];
655:     delta[0] = delta[1];

657:     MPI_Iallreduce(MPI_IN_PLACE,lambda,15,MPIU_SCALAR,MPIU_SUM,pcomm,&req);

659:     KSP_PCApply(ksp,N,G[0]);                       /*  g <- Bn  */
660:     KSP_MatMult(ksp,Amat,G[0],H[0]);               /*  h <- Ag  */

662:     KSP_PCApply(ksp,H[0],E);               /*  e <- Bh  */
663:     KSP_MatMult(ksp,Amat,E,F);             /*  f <- Ae */

665:     MPI_Wait(&req,&stat);

667:     gamma[1] = lambda[10];
668:     delta[1] = lambda[11];
669:     dp = PetscSqrtReal(PetscAbsScalar(lambda[12]));

671:     alphaold = alpha[1];
672:     ksp->its = i;

674:     if (i > 0) {
675:       if (ksp->normtype == KSP_NORM_NONE) dp = 0.0;
676:       ksp->rnorm = dp;
677:       KSPLogResidualHistory(ksp,dp);
678:       KSPMonitor(ksp,i,dp);
679:       (*ksp->converged)(ksp,i,dp,&ksp->reason,ksp->cnvP);
680:       if (ksp->reason) break;
681:     }
682:   }

684:   if (i >= ksp->max_it) ksp->reason = KSP_DIVERGED_ITS;
685:   return(0);
686: }

688: /*MC
689:    KSPPIPECG2 - Pipelined conjugate gradient method with a single non-blocking allreduce per two iterations.

691:    This method has only a single non-blocking reduction per two iterations, compared to 2 blocking for standard CG.  The
692:    non-blocking reduction is overlapped by two matrix-vector products and two preconditioner applications.

694:    Level: intermediate

696:    Notes:
697:    MPI configuration may be necessary for reductions to make asynchronous progress, which is important for performance of pipelined methods.
698:    See the FAQ on the PETSc website for details.

700:    Contributed by:
701:    Manasi Tiwari, Computational and Data Sciences, Indian Institute of Science, Bangalore

703:    Reference:
704:    Manasi Tiwari and Sathish Vadhiyar, "Pipelined Conjugate Gradient Methods for Distributed Memory Systems",
705:    Submitted to International Conference on High Performance Computing, Data and Analytics 2020.

707: .seealso: KSPCreate(), KSPSetType(), KSPCG, KSPPIPECG
708: M*/
709: PETSC_EXTERN PetscErrorCode KSPCreate_PIPECG2(KSP ksp)
710: {

714:   KSPSetSupportedNorm(ksp,KSP_NORM_UNPRECONDITIONED,PC_LEFT,2);
715:   KSPSetSupportedNorm(ksp,KSP_NORM_PRECONDITIONED,PC_LEFT,2);
716:   KSPSetSupportedNorm(ksp,KSP_NORM_NATURAL,PC_LEFT,2);
717:   KSPSetSupportedNorm(ksp,KSP_NORM_NONE,PC_LEFT,1);

719:   ksp->ops->setup          = KSPSetUp_PIPECG2;
720:   ksp->ops->solve          = KSPSolve_PIPECG2;
721:   ksp->ops->destroy        = KSPDestroyDefault;
722:   ksp->ops->view           = NULL;
723:   ksp->ops->setfromoptions = NULL;
724:   ksp->ops->buildsolution  = KSPBuildSolutionDefault;
725:   ksp->ops->buildresidual  = KSPBuildResidualDefault;
726:   return(0);
727: }