Actual source code: ibcgs.c

petsc-master 2016-12-07
Report Typos and Errors

  2:  #include <petsc/private/kspimpl.h>
  3:  #include <petsc/private/vecimpl.h>

  /*
     KSPSetUp_IBCGS - Set-up routine installed as ksp->ops->setup for the IBCGS solver.

     Raises PETSC_ERR_SUP when the preconditioner reports diagonal scaling, then requests
     nine work vectors; KSPSolve_IBCGS() uses ksp->work[0] through ksp->work[8].
  */
  7: static PetscErrorCode KSPSetUp_IBCGS(KSP ksp)
  8: {
 10:   PetscBool      diagonalscale;

       /* diagonal scaling of the system is not handled by this solver */
 13:   PCGetDiagonalScale(ksp->pc,&diagonalscale);
 14:   if (diagonalscale) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"Krylov method %s does not support diagonal scaling",((PetscObject)ksp)->type_name);
       /* R0, Rn, Un, F0, Vn, Zn, Qn, Tn, Sn */
 15:   KSPSetWorkVecs(ksp,9);
 16:   return(0);
 17: }

 19: /*
 20:     The code below "cheats" from PETSc style
 21:        1) VecRestoreArray() is called immediately after VecGetArray() and the array values are still accessed; the reason for the immediate
 22:           restore is that Vec operations are done on some of the vectors during the solve and if we did not restore immediately it would
 23:           generate two VecGetArray() calls (the second one inside the Vec operation) without a restore between them.
 24:        2) The vector operations are done directly on the arrays instead of with VecXXXX() calls

 26:        For clarity in the code we name single VECTORS with two names, for example, Rn_1 and Rn, but they are actually always
 27:      the exact same memory. We do this with macro defines so that the compiler won't think they are
 28:      two different variables.

 30: */
 31: #define Xn_1 Xn
 32: #define xn_1 xn
 33: #define Rn_1 Rn
 34: #define rn_1 rn
 35: #define Un_1 Un
 36: #define un_1 un
 37: #define Vn_1 Vn
 38: #define vn_1 vn
 39: #define Qn_1 Qn
 40: #define qn_1 qn
 41: #define Zn_1 Zn
 42: #define zn_1 zn
 /*
    KSPSolve_IBCGS - Solve routine installed as ksp->ops->solve for the IBCGS solver.

    Input Parameter:
 .  ksp - the Krylov solver context; ksp->vec_rhs and ksp->vec_sol hold the right-hand side and
          the iterate, and ksp->work[0..8] must already exist (see KSPSetUp_IBCGS())

    Notes:
    The raw arrays of the vectors are fetched once up front and then accessed directly in the
    hand-written loops (see the comment above the Xn_1 ... zn_1 defines).

    Per iteration the six local inner products (and, when ksp->lagnorm is set and its > 1, the
    previous local residual norm squared) are combined in a single MPIU_Allreduce().

    On return ksp->reason holds the value produced by the convergence test, KSP_DIVERGED_NANORINF
    if taun, kappan or thetan is exactly zero (an error is raised instead when
    ksp->errorifnotconverged is set), or KSP_DIVERGED_ITS if the loop ran out of iterations
    without the convergence test setting a reason.
 */
 45: static PetscErrorCode  KSPSolve_IBCGS(KSP ksp)
 46: {
 48:   PetscInt       i,N;
 49:   PetscReal      rnorm,rnormin = 0.0;
 50: #if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
 51:   /* Because of possible instabilities in the algorithm (as indicated by different residual histories for the same problem
 52:      on the same number of processes  with different runs) we support computing the inner products using Intel's 80 bit arithmetic
 53:      rather than just 64 bit. Thus we copy our double precision values into long doubles (hoping this keeps the 16 extra bits)
 54:      and tell MPI to do its Allreduces with MPI_LONG_DOUBLE.

 56:      Note for developers that does not affect the code. Intel's long double is implemented by storing the 80 bits of extended double
 57:      precision into a 16 byte space (the rest of the space is ignored)  */
 58:   long double insums[7],outsums[7];
 59: #else
 60:   PetscScalar insums[7],outsums[7];
 61: #endif
 62:   PetscScalar                       sigman_2, sigman_1, sigman, pin_1, pin, phin_1, phin,tmp1,tmp2;
 63:   PetscScalar                       taun_1, taun, rhon, alphan_1, alphan, omegan_1, omegan;
 64:   const PetscScalar *PETSC_RESTRICT r0, *PETSC_RESTRICT f0, *PETSC_RESTRICT qn, *PETSC_RESTRICT b, *PETSC_RESTRICT un;
 65:   PetscScalar *PETSC_RESTRICT       rn, *PETSC_RESTRICT xn, *PETSC_RESTRICT vn, *PETSC_RESTRICT zn;
 66:   /* the rest do not have to keep n_1 values */
 67:   PetscScalar                       kappan, thetan, etan, gamman, betan, deltan;
 68:   const PetscScalar *PETSC_RESTRICT tn;
 69:   PetscScalar *PETSC_RESTRICT       sn;
 70:   Vec                               R0,Rn,Xn,F0,Vn,Zn,Qn,Tn,Sn,B,Un;
 71:   Mat                               A;

 74:   if (!ksp->vec_rhs->petscnative) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"Only coded for PETSc vectors");

 76:  #if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
 77:   /* since 80 bit long doubles do not fill the upper bits, we fill them initially so that
 78:      valgrind won't detect MPI_Allreduce() with uninitialized data */
       /* both the send and the receive buffer are zeroed */
 79:   PetscMemzero(insums,sizeof(insums));
 80:   PetscMemzero(outsums,sizeof(outsums));
 81: #endif

 83:   PCGetOperators(ksp->pc,&A,NULL);
 84:   VecGetLocalSize(ksp->vec_sol,&N);
       /* Fetch each raw array and restore immediately; the pointers are kept and used below.
          Vectors whose arrays are written in the loops (Xn, Rn, Vn, Zn, Sn) use VecGetArray(),
          the ones that are only read directly use VecGetArrayRead(). */
 85:   Xn   = ksp->vec_sol; VecGetArray(Xn_1,(PetscScalar**)&xn_1); VecRestoreArray(Xn_1,NULL);
 86:   B    = ksp->vec_rhs; VecGetArrayRead(B,(const PetscScalar**)&b); VecRestoreArrayRead(B,NULL);
 87:   R0   = ksp->work[0]; VecGetArrayRead(R0,(const PetscScalar**)&r0); VecRestoreArrayRead(R0,NULL);
 88:   Rn   = ksp->work[1]; VecGetArray(Rn_1,(PetscScalar**)&rn_1); VecRestoreArray(Rn_1,NULL);
 89:   Un   = ksp->work[2]; VecGetArrayRead(Un_1,(const PetscScalar**)&un_1); VecRestoreArrayRead(Un_1,NULL);
 90:   F0   = ksp->work[3]; VecGetArrayRead(F0,(const PetscScalar**)&f0); VecRestoreArrayRead(F0,NULL);
 91:   Vn   = ksp->work[4]; VecGetArray(Vn_1,(PetscScalar**)&vn_1); VecRestoreArray(Vn_1,NULL);
 92:   Zn   = ksp->work[5]; VecGetArray(Zn_1,(PetscScalar**)&zn_1); VecRestoreArray(Zn_1,NULL);
 93:   Qn   = ksp->work[6]; VecGetArrayRead(Qn_1,(const PetscScalar**)&qn_1); VecRestoreArrayRead(Qn_1,NULL);
 94:   Tn   = ksp->work[7]; VecGetArrayRead(Tn,(const PetscScalar**)&tn); VecRestoreArrayRead(Tn,NULL);
 95:   Sn   = ksp->work[8]; VecGetArray(Sn,(PetscScalar**)&sn); VecRestoreArray(Sn,NULL);

 97:   /* r0 = rn_1 = b - A*xn_1; */
 98:   /* KSP_PCApplyBAorAB(ksp,Xn_1,Rn_1,Tn);
 99:      VecAYPX(Rn_1,-1.0,B); */
100:   KSPInitialResidual(ksp,Xn_1,Tn,Sn,Rn_1,B);

102:   VecNorm(Rn_1,NORM_2,&rnorm);
103:   KSPMonitor(ksp,0,rnorm);
104:   (*ksp->converged)(ksp,0,rnorm,&ksp->reason,ksp->cnvP);
105:   if (ksp->reason) return(0);

107:   VecCopy(Rn_1,R0);

109:   /* un_1 = A*rn_1; */
110:   KSP_PCApplyBAorAB(ksp,Rn_1,Un_1,Tn);

112:   /* f0   = A'*rn_1; */
113:   if (ksp->pc_side == PC_RIGHT) { /* B' A' */
114:     KSP_MatMultTranspose(ksp,A,R0,Tn);
115:     KSP_PCApplyTranspose(ksp,Tn,F0);
116:   } else if (ksp->pc_side == PC_LEFT) { /* A' B' */
117:     KSP_PCApplyTranspose(ksp,R0,Tn);
118:     KSP_MatMultTranspose(ksp,A,Tn,F0);
119:   }

121:   /*qn_1 = vn_1 = zn_1 = 0.0; */
122:   VecSet(Qn_1,0.0);
123:   VecSet(Vn_1,0.0);
124:   VecSet(Zn_1,0.0);

126:   sigman_2 = pin_1 = taun_1 = 0.0;

128:   /* the paper says phin_1 should be initialized to zero, it is actually R0'R0 */
129:   VecDot(R0,R0,&phin_1);

131:   /* sigman_1 = rn_1'un_1  */
132:   VecDot(R0,Un_1,&sigman_1);

134:   alphan_1 = omegan_1 = 1.0;

136:   for (ksp->its = 1; ksp->its<ksp->max_it+1; ksp->its++) {
137:     rhon = phin_1 - omegan_1*sigman_2 + omegan_1*alphan_1*pin_1;
         /* taun_1 is still zero on the first pass, so the division is skipped there */
138:     if (ksp->its == 1) deltan = rhon;
139:     else deltan = rhon/taun_1;
140:     betan = deltan/omegan_1;
141:     taun  = sigman_1 + betan*taun_1  - deltan*pin_1;
142:     if (taun == 0.0) {
143:       if (ksp->errorifnotconverged) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_NOT_CONVERGED,"KSPSolve has not converged due to taun is zero, iteration %D",ksp->its);
144:       else {
145:         ksp->reason = KSP_DIVERGED_NANORINF;
146:         return(0);
147:       }
148:     }
149:     alphan = rhon/taun;
150:     PetscLogFlops(15.0);

152:     /*
153:         zn = alphan*rn_1 + (alphan/alphan_1)betan*zn_1 - alphan*deltan*vn_1
154:         vn = un_1 + betan*vn_1 - deltan*qn_1
155:         sn = rn_1 - alphan*vn

157:        The algorithm in the paper is missing the alphan/alphan_1 term in the zn update
158:     */
159:     PetscLogEventBegin(VEC_Ops,0,0,0,0);
160:     tmp1 = (alphan/alphan_1)*betan;
161:     tmp2 = alphan*deltan;
162:     for (i=0; i<N; i++) {
163:       zn[i] = alphan*rn_1[i] + tmp1*zn_1[i] - tmp2*vn_1[i];
164:       vn[i] = un_1[i] + betan*vn_1[i] - deltan*qn_1[i];
165:       sn[i] = rn_1[i] - alphan*vn[i];
166:     }
167:     PetscLogFlops(3.0+11.0*N);
168:     PetscLogEventEnd(VEC_Ops,0,0,0,0);

170:     /*
171:         qn = A*vn
172:     */
173:     KSP_PCApplyBAorAB(ksp,Vn,Qn,Tn);

175:     /*
176:         tn = un_1 - alphan*qn
177:     */
178:     VecWAXPY(Tn,-alphan,Qn,Un_1);


181:     /*
182:         phin = r0'sn
183:         pin  = r0'qn
184:         gamman = f0'sn
185:         etan   = f0'tn
186:         thetan = sn'tn
187:         kappan = tn'tn
188:     */
189:     PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
190:     phin = pin = gamman = etan = thetan = kappan = 0.0;
191:     for (i=0; i<N; i++) {
192:       phin   += r0[i]*sn[i];
193:       pin    += r0[i]*qn[i];
194:       gamman += f0[i]*sn[i];
195:       etan   += f0[i]*tn[i];
196:       thetan += sn[i]*tn[i];
197:       kappan += tn[i]*tn[i];
198:     }
199:     PetscLogFlops(12.0*N);
200:     PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);

         /* pack the local partial sums; slot 6 carries the previous iteration's local |rn|^2 */
202:     insums[0] = phin;
203:     insums[1] = pin;
204:     insums[2] = gamman;
205:     insums[3] = etan;
206:     insums[4] = thetan;
207:     insums[5] = kappan;
208:     insums[6] = rnormin;

210:     PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
211: #if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
212:     if (ksp->lagnorm && ksp->its > 1) {
213:       MPIU_Allreduce(insums,outsums,7,MPI_LONG_DOUBLE,MPI_SUM,PetscObjectComm((PetscObject)ksp));
214:     } else {
215:       MPIU_Allreduce(insums,outsums,6,MPI_LONG_DOUBLE,MPI_SUM,PetscObjectComm((PetscObject)ksp));
216:     }
217: #else
218:     if (ksp->lagnorm && ksp->its > 1) {
219:       MPIU_Allreduce(insums,outsums,7,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)ksp));
220:     } else {
221:       MPIU_Allreduce(insums,outsums,6,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)ksp));
222:     }
223: #endif
224:     PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
225:     phin   = outsums[0];
226:     pin    = outsums[1];
227:     gamman = outsums[2];
228:     etan   = outsums[3];
229:     thetan = outsums[4];
230:     kappan = outsums[5];
         /* with lagged norms the residual norm tested below is the one from the previous iteration */
231:     if (ksp->lagnorm && ksp->its > 1) rnorm = PetscSqrtReal(PetscRealPart(outsums[6]));

233:     if (kappan == 0.0) {
234:       if (ksp->errorifnotconverged) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_NOT_CONVERGED,"KSPSolve has not converged due to kappan is zero, iteration %D",ksp->its);
235:       else {
236:         ksp->reason = KSP_DIVERGED_NANORINF;
237:         return(0);
238:       }
239:     }
240:     if (thetan == 0.0) {
241:       if (ksp->errorifnotconverged) SETERRQ1(PetscObjectComm((PetscObject)ksp),PETSC_ERR_NOT_CONVERGED,"KSPSolve has not converged due to thetan is zero, iteration %D",ksp->its);
242:       else {
243:         ksp->reason = KSP_DIVERGED_NANORINF;
244:         return(0);
245:       }
246:     }
247:     omegan = thetan/kappan;
248:     sigman = gamman - omegan*etan;

250:     /*
251:         rn = sn - omegan*tn
252:         xn = xn_1 + zn + omegan*sn
253:     */
254:     PetscLogEventBegin(VEC_Ops,0,0,0,0);
255:     rnormin = 0.0;
256:     for (i=0; i<N; i++) {
257:       rn[i]    = sn[i] - omegan*tn[i];
258:       rnormin += PetscRealPart(PetscConj(rn[i])*rn[i]);
259:       xn[i]   += zn[i] + omegan*sn[i];
260:     }
         /* xn was modified through the raw pointer, so the state of Xn is bumped by hand */
261:     PetscObjectStateIncrease((PetscObject)Xn);
262:     PetscLogFlops(7.0*N);
263:     PetscLogEventEnd(VEC_Ops,0,0,0,0);

         /* without lagged norms the current residual norm needs its own reduction */
265:     if (!ksp->lagnorm && ksp->chknorm < ksp->its) {
266:       PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
267:       MPIU_Allreduce(&rnormin,&rnorm,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)ksp));
268:       PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,PetscObjectComm((PetscObject)ksp));
269:       rnorm = PetscSqrtReal(rnorm);
270:     }

272:     /* Test for convergence */
273:     KSPMonitor(ksp,ksp->its,rnorm);
274:     (*ksp->converged)(ksp,ksp->its,rnorm,&ksp->reason,ksp->cnvP);
275:     if (ksp->reason) break;

277:     /* un = A*rn */
278:     KSP_PCApplyBAorAB(ksp,Rn,Un,Tn);

280:     /* Update n-1 locations with n locations */
281:     sigman_2 = sigman_1;
282:     sigman_1 = sigman;
283:     pin_1    = pin;
284:     phin_1   = phin;
285:     alphan_1 = alphan;
286:     taun_1   = taun;
287:     omegan_1 = omegan;
288:   }
       /* Only report running out of iterations when the convergence test did not set a reason;
          a reason set on the final allowed iteration (its == max_it) must be kept. */
289:   if (!ksp->reason) ksp->reason = KSP_DIVERGED_ITS;
290:   KSPUnwindPreconditioner(ksp,Xn,Tn);
291:   return(0);
292: }


295: /*MC
296:      KSPIBCGS - Implements the IBiCGStab (Improved Stabilized version of BiConjugate Gradient) method
297:             in an alternative form to have only a single global reduction operation instead of the usual 3 (or 4)

299:    Options Database Keys:
300: .   see KSPSolve()

302:    Level: beginner

304:    Notes: Supports left and right preconditioning

306:           See KSPBCGSL for additional stabilization

308:           Unlike the Bi-CG-stab algorithm, this requires one multiplication by the transpose of the operator
309:            before the iteration starts.

311:           The paper has two errors in the algorithm presented, they are fixed in the code in KSPSolve_IBCGS()

313:           For maximum reduction in the number of global reduction operations, this solver should be used with
314:           KSPSetLagNorm().

316:           This is not supported for complex numbers.

318:    Reference: The Improved BiCGStab Method for Large and Sparse Unsymmetric Linear Systems on Parallel Distributed Memory
319:                      Architectures. L. T. Yang and R. Brent, Proceedings of the Fifth International Conference on Algorithms and
320:                      Architectures for Parallel Processing, 2002, IEEE.

322: .seealso:  KSPCreate(), KSPSetType(), KSPType (for list of available types), KSP, KSPBICG, KSPBCGSL, KSPIBCGS, KSPSetLagNorm()
323: M*/

/*
   KSPCreate_IBCGS - Constructor for the IBCGS solver type: fills in the function table of ksp.

   The solver keeps no private data (ksp->data is null). In builds with PETSC_USE_COMPLEX
   defined the routine raises PETSC_ERR_SUP after the table has been filled in.
*/
327: PETSC_EXTERN PetscErrorCode KSPCreate_IBCGS(KSP ksp)
328: {

332:   ksp->data = (void*)0;

       /* norm type / preconditioner side combinations registered for this solver */
334:   KSPSetSupportedNorm(ksp,KSP_NORM_PRECONDITIONED,PC_LEFT,3);
335:   KSPSetSupportedNorm(ksp,KSP_NORM_UNPRECONDITIONED,PC_RIGHT,2);

337:   ksp->ops->setup          = KSPSetUp_IBCGS;
338:   ksp->ops->solve          = KSPSolve_IBCGS;
339:   ksp->ops->destroy        = KSPDestroyDefault;
340:   ksp->ops->buildsolution  = KSPBuildSolutionDefault;
341:   ksp->ops->buildresidual  = KSPBuildResidualDefault;
       /* no solver-specific options or viewer */
342:   ksp->ops->setfromoptions = 0;
343:   ksp->ops->view           = 0;
344: #if defined(PETSC_USE_COMPLEX)
345:   SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_SUP,"This is not supported for complex numbers");
346: #endif
347:   return(0);
348: }