Actual source code: ibcgs.c (petsc-3.3-p5, 2012-12-01)
#include <petsc-private/kspimpl.h>

static PetscErrorCode KSPSetUp_IBCGS(KSP ksp)
{
  PetscBool diagonalscale;

  PCGetDiagonalScale(ksp->pc,&diagonalscale);
  if (diagonalscale) SETERRQ1(((PetscObject)ksp)->comm,PETSC_ERR_SUP,"Krylov method %s does not support diagonal scaling",((PetscObject)ksp)->type_name);
  KSPDefaultGetWork(ksp,9);
  return(0);
}
/*
    The code below "cheats" from PETSc style
       1) VecRestoreArray() is called immediately after VecGetArray() and the array values are still accessed; the reason for the immediate
          restore is that Vec operations are done on some of the vectors during the solve and if we did not restore immediately it would
          generate two VecGetArray() calls (the second one inside the Vec operation) without a restore between them.
       2) The vector operations are done directly on the arrays instead of with VecXXXX() calls

    For clarity in the code we name single VECTORS with two names, for example, Rn_1 and Rn, but they are actually always
    the exact same memory. We do this with macro defines so that the compiler won't think they are
    two different variables. A sketch of both cheats follows the defines below.
*/
#define Xn_1 Xn
#define xn_1 xn
#define Rn_1 Rn
#define rn_1 rn
#define Un_1 Un
#define un_1 un
#define Vn_1 Vn
#define vn_1 vn
#define Qn_1 Qn
#define qn_1 qn
#define Zn_1 Zn
#define zn_1 zn
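
/* A minimal sketch (illustrative only, guarded out so it is never compiled or called)
   of the two style cheats above, on a hypothetical Vec X with a hypothetical alias Xold: */
#if 0
static void IBCGSStyleSketch(Vec X)
{
  PetscScalar *x;
#define Xold X                      /* Xold and X name the exact same Vec/memory          */
  VecGetArray(Xold,&x);             /* take the raw array pointer                         */
  VecRestoreArray(Xold,PETSC_NULL); /* restore immediately, so later VecXXXX() operations
                                       on X can do their own VecGetArray() without two
                                       unmatched gets                                     */
  x[0] = 2.0*x[1];                  /* the pointer is still used after the restore, and
                                       the update is applied directly to the array rather
                                       than through a VecXXXX() call                      */
#undef Xold
}
#endif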

static PetscErrorCode KSPSolve_IBCGS(KSP ksp)
{
  PetscInt  i,N;
  PetscReal rnorm,rnormin = 0.0;
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
  /* Because of possible instabilities in the algorithm (as indicated by different residual histories for the same problem
     on the same number of processes with different runs) we support computing the inner products using Intel's 80-bit arithmetic
     rather than just 64-bit. Thus we copy our double precision values into long doubles (hoping this keeps the 16 extra bits)
     and tell MPI to do its Allreduces with MPI_LONG_DOUBLE.

     A note for developers (it does not affect the code): Intel's long double is implemented by storing the 80 bits of extended double
     precision in a 16 byte space (the rest of the space is ignored). A sketch of this reduction pattern follows these declarations. */
  long double insums[7],outsums[7];
#else
  PetscScalar insums[7],outsums[7];
#endif
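
  /* A minimal sketch (illustrative only, never compiled; a, b and comm are hypothetical)
     of the extended-precision reduction described above: widen the 64-bit doubles to
     long double, reduce with MPI_LONG_DOUBLE, then truncate back. */
#if 0
  double      a,b;
  long double lin[2],lout[2];
  lin[0] = (long double)a;                                /* widen to 80-bit extended  */
  lin[1] = (long double)b;
  MPI_Allreduce(lin,lout,2,MPI_LONG_DOUBLE,MPI_SUM,comm); /* sum in extended precision */
  a = (double)lout[0];                                    /* truncate back to double   */
  b = (double)lout[1];
#endif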
  PetscScalar sigman_2, sigman_1, sigman, pin_1, pin, phin_1, phin, tmp1, tmp2;
  PetscScalar taun_1, taun, rhon, alphan_1, alphan, omegan_1, omegan;
  const PetscScalar *PETSC_RESTRICT r0, *PETSC_RESTRICT f0, *PETSC_RESTRICT qn, *PETSC_RESTRICT b, *PETSC_RESTRICT un;
  PetscScalar *PETSC_RESTRICT rn, *PETSC_RESTRICT xn, *PETSC_RESTRICT vn, *PETSC_RESTRICT zn;
  /* the rest do not have to keep n_1 values */
  PetscScalar kappan, thetan, etan, gamman, betan, deltan;
  const PetscScalar *PETSC_RESTRICT tn;
  PetscScalar *PETSC_RESTRICT sn;
  Vec R0,Rn,Xn,F0,Vn,Zn,Qn,Tn,Sn,B,Un;
  Mat A;

  if (!ksp->vec_rhs->petscnative) SETERRQ(((PetscObject)ksp)->comm,PETSC_ERR_SUP,"Only coded for PETSc vectors");

  PCGetOperators(ksp->pc,&A,PETSC_NULL,PETSC_NULL);
  VecGetLocalSize(ksp->vec_sol,&N);
  Xn = ksp->vec_sol;VecGetArray(Xn_1,(PetscScalar**)&xn_1);VecRestoreArray(Xn_1,PETSC_NULL);
  B  = ksp->vec_rhs;VecGetArrayRead(B,(const PetscScalar**)&b);VecRestoreArrayRead(B,PETSC_NULL);
  R0 = ksp->work[0];VecGetArrayRead(R0,(const PetscScalar**)&r0);VecRestoreArrayRead(R0,PETSC_NULL);
  Rn = ksp->work[1];VecGetArray(Rn_1,(PetscScalar**)&rn_1);VecRestoreArray(Rn_1,PETSC_NULL);
  Un = ksp->work[2];VecGetArrayRead(Un_1,(const PetscScalar**)&un_1);VecRestoreArrayRead(Un_1,PETSC_NULL);
  F0 = ksp->work[3];VecGetArrayRead(F0,(const PetscScalar**)&f0);VecRestoreArrayRead(F0,PETSC_NULL);
  Vn = ksp->work[4];VecGetArray(Vn_1,(PetscScalar**)&vn_1);VecRestoreArray(Vn_1,PETSC_NULL);
  Zn = ksp->work[5];VecGetArray(Zn_1,(PetscScalar**)&zn_1);VecRestoreArray(Zn_1,PETSC_NULL);
  Qn = ksp->work[6];VecGetArrayRead(Qn_1,(const PetscScalar**)&qn_1);VecRestoreArrayRead(Qn_1,PETSC_NULL);
  Tn = ksp->work[7];VecGetArrayRead(Tn,(const PetscScalar**)&tn);VecRestoreArrayRead(Tn,PETSC_NULL);
  Sn = ksp->work[8];VecGetArrayRead(Sn,(const PetscScalar**)&sn);VecRestoreArrayRead(Sn,PETSC_NULL);

  /* r0 = rn_1 = b - A*xn_1; */
  /* KSP_PCApplyBAorAB(ksp,Xn_1,Rn_1,Tn);
     VecAYPX(Rn_1,-1.0,B); */
  KSPInitialResidual(ksp,Xn_1,Tn,Sn,Rn_1,B);

  VecNorm(Rn_1,NORM_2,&rnorm);
  KSPMonitor(ksp,0,rnorm);
  (*ksp->converged)(ksp,0,rnorm,&ksp->reason,ksp->cnvP);
  if (ksp->reason) return(0);

  VecCopy(Rn_1,R0);

  /* un_1 = A*rn_1; */
  KSP_PCApplyBAorAB(ksp,Rn_1,Un_1,Tn);

  /* f0 = A'*rn_1; */
  if (ksp->pc_side == PC_RIGHT) { /* B' A' */
    MatMultTranspose(A,R0,Tn);
    PCApplyTranspose(ksp->pc,Tn,F0);
  } else if (ksp->pc_side == PC_LEFT) { /* A' B' */
    PCApplyTranspose(ksp->pc,R0,Tn);
    MatMultTranspose(A,Tn,F0);
  }

  /* qn_1 = vn_1 = zn_1 = 0.0; */
  VecSet(Qn_1,0.0);
  VecSet(Vn_1,0.0);
  VecSet(Zn_1,0.0);

  sigman_2 = pin_1 = taun_1 = 0.0;

  /* the paper says phin_1 should be initialized to zero; it is actually R0'R0 */
  VecDot(R0,R0,&phin_1);

  /* sigman_1 = rn_1'un_1 */
  VecDot(R0,Un_1,&sigman_1);

  alphan_1 = omegan_1 = 1.0;

  for (ksp->its = 1; ksp->its<ksp->max_it+1; ksp->its++) {
    rhon = phin_1 - omegan_1*sigman_2 + omegan_1*alphan_1*pin_1;
    /* if (rhon == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"rhon is zero, iteration %D",n); */
    if (ksp->its == 1) deltan = rhon;
    else deltan = rhon/taun_1;
    betan = deltan/omegan_1;
    taun  = sigman_1 + betan*taun_1 - deltan*pin_1;
    if (taun == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"taun is zero, iteration %D",ksp->its);
    alphan = rhon/taun;
    PetscLogFlops(15.0);

    /*
       zn = alphan*rn_1 + (alphan/alphan_1)*betan*zn_1 - alphan*deltan*vn_1
       vn = un_1 + betan*vn_1 - deltan*qn_1
       sn = rn_1 - alphan*vn

       The algorithm in the paper is missing the alphan/alphan_1 term in the zn update
    */
    PetscLogEventBegin(VEC_Ops,0,0,0,0);
    tmp1 = (alphan/alphan_1)*betan;
    tmp2 = alphan*deltan;
    for (i=0; i<N; i++) {
      zn[i] = alphan*rn_1[i] + tmp1*zn_1[i] - tmp2*vn_1[i];
      vn[i] = un_1[i] + betan*vn_1[i] - deltan*qn_1[i];
      sn[i] = rn_1[i] - alphan*vn[i];
    }
    PetscLogFlops(3.0+11.0*N);
    PetscLogEventEnd(VEC_Ops,0,0,0,0);

    /* qn = A*vn */
    KSP_PCApplyBAorAB(ksp,Vn,Qn,Tn);

    /* tn = un_1 - alphan*qn */
    VecWAXPY(Tn,-alphan,Qn,Un_1);

    /*
       phin   = r0'sn
       pin    = r0'qn
       gamman = f0'sn
       etan   = f0'tn
       thetan = sn'tn
       kappan = tn'tn
    */
    PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
    phin = pin = gamman = etan = thetan = kappan = 0.0;
    for (i=0; i<N; i++) {
      phin   += r0[i]*sn[i];
      pin    += r0[i]*qn[i];
      gamman += f0[i]*sn[i];
      etan   += f0[i]*tn[i];
      thetan += sn[i]*tn[i];
      kappan += tn[i]*tn[i];
    }
    PetscLogFlops(12.0*N);
    PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);
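
    /* Pack all six partial inner products (plus, when the norm is lagged, the previous
       residual norm) into one buffer so that a single MPI_Allreduce serves the entire
       iteration, instead of the usual 3 (or 4) separate reductions. */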
    insums[0] = phin;
    insums[1] = pin;
    insums[2] = gamman;
    insums[3] = etan;
    insums[4] = thetan;
    insums[5] = kappan;
    insums[6] = rnormin;

    PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
    if (ksp->lagnorm && ksp->its > 1) {
      MPI_Allreduce(insums,outsums,7,MPI_LONG_DOUBLE,MPI_SUM,((PetscObject)ksp)->comm);
    } else {
      MPI_Allreduce(insums,outsums,6,MPI_LONG_DOUBLE,MPI_SUM,((PetscObject)ksp)->comm);
    }
#else
    if (ksp->lagnorm && ksp->its > 1) {
      MPI_Allreduce(insums,outsums,7,MPIU_SCALAR,MPIU_SUM,((PetscObject)ksp)->comm);
    } else {
      MPI_Allreduce(insums,outsums,6,MPIU_SCALAR,MPIU_SUM,((PetscObject)ksp)->comm);
    }
#endif
    PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);
    phin   = outsums[0];
    pin    = outsums[1];
    gamman = outsums[2];
    etan   = outsums[3];
    thetan = outsums[4];
    kappan = outsums[5];
    if (ksp->lagnorm && ksp->its > 1) rnorm = PetscSqrtReal(PetscRealPart(outsums[6]));

    if (kappan == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"kappan is zero, iteration %D",ksp->its);
    if (thetan == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_CONV_FAILED,"thetan is zero, iteration %D",ksp->its);
    omegan = thetan/kappan;
    sigman = gamman - omegan*etan;

    /*
       rn = sn - omegan*tn
       xn = xn_1 + zn + omegan*sn
    */
    PetscLogEventBegin(VEC_Ops,0,0,0,0);
    rnormin = 0.0;
    for (i=0; i<N; i++) {
      rn[i]    = sn[i] - omegan*tn[i];
      rnormin += PetscRealPart(PetscConj(rn[i])*rn[i]);
      xn[i]   += zn[i] + omegan*sn[i];
    }
    PetscObjectStateIncrease((PetscObject)Xn);
    PetscLogFlops(7.0*N);
    PetscLogEventEnd(VEC_Ops,0,0,0,0);

    if (!ksp->lagnorm && ksp->chknorm < ksp->its) {
      PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);
      MPI_Allreduce(&rnormin,&rnorm,1,MPIU_REAL,MPIU_SUM,((PetscObject)ksp)->comm);
      PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,((PetscObject)ksp)->comm);
      rnorm = PetscSqrtReal(rnorm);
    }

    /* Test for convergence */
    KSPMonitor(ksp,ksp->its,rnorm);
    (*ksp->converged)(ksp,ksp->its,rnorm,&ksp->reason,ksp->cnvP);
    if (ksp->reason) break;

    /* un = A*rn */
    KSP_PCApplyBAorAB(ksp,Rn,Un,Tn);

    /* Update n-1 locations with n locations */
    sigman_2 = sigman_1;
    sigman_1 = sigman;
    pin_1    = pin;
    phin_1   = phin;
    alphan_1 = alphan;
    taun_1   = taun;
    omegan_1 = omegan;
  }
  if (ksp->its >= ksp->max_it) {
    ksp->reason = KSP_DIVERGED_ITS;
  }
  KSPUnwindPreconditioner(ksp,Xn,Tn);
  return(0);
}

/*MC
     KSPIBCGS - Implements the IBiCGStab (Improved Stabilized version of BiConjugate Gradient) method
            in an alternative form to have only a single global reduction operation instead of the usual 3 (or 4)

   Options Database Keys:
.   see KSPSolve()

   Level: beginner

   Notes: Supports left and right preconditioning

          See KSPBCGSL for additional stabilization

          Unlike the Bi-CG-stab algorithm, this requires one multiplication by the transpose of the operator
          before the iteration starts.

          The paper has two errors in the algorithm presented; they are fixed in the code in KSPSolve_IBCGS()

          For maximum reduction in the number of global reduction operations, this solver should be used with
          KSPSetLagNorm().

          This is not supported for complex numbers.

   Reference: The Improved BiCGStab Method for Large and Sparse Unsymmetric Linear Systems on Parallel Distributed Memory
              Architectures. L. T. Yang and R. Brent, Proceedings of the Fifth International Conference on Algorithms and
              Architectures for Parallel Processing, 2002, IEEE.

.seealso: KSPCreate(), KSPSetType(), KSPType (for list of available types), KSP, KSPBICG, KSPBCGSL, KSPIBCGS, KSPSetLagNorm()
M*/
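
/* A minimal usage sketch (assumes a Mat A and Vecs b,x already assembled; the solver is
   selected exactly like any other KSP type):

       KSP ksp;
       KSPCreate(PETSC_COMM_WORLD,&ksp);
       KSPSetOperators(ksp,A,A,SAME_NONZERO_PATTERN);
       KSPSetType(ksp,KSPIBCGS);
       KSPSetLagNorm(ksp,PETSC_TRUE);    lag the residual norm so that its reduction is
                                         merged into the single inner-product reduction
       KSPSetFromOptions(ksp);
       KSPSolve(ksp,b,x);

   or equivalently, from the command line: -ksp_type ibcgs -ksp_lag_norm */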

EXTERN_C_BEGIN
PetscErrorCode KSPCreate_IBCGS(KSP ksp)
{
  ksp->data = (void*)0;
  KSPSetSupportedNorm(ksp,KSP_NORM_PRECONDITIONED,PC_LEFT,2);
  KSPSetSupportedNorm(ksp,KSP_NORM_UNPRECONDITIONED,PC_RIGHT,1);

  ksp->ops->setup          = KSPSetUp_IBCGS;
  ksp->ops->solve          = KSPSolve_IBCGS;
  ksp->ops->destroy        = KSPDefaultDestroy;
  ksp->ops->buildsolution  = KSPDefaultBuildSolution;
  ksp->ops->buildresidual  = KSPDefaultBuildResidual;
  ksp->ops->setfromoptions = 0;
  ksp->ops->view           = 0;
#if defined(PETSC_USE_COMPLEX)
  SETERRQ(((PetscObject)ksp)->comm,PETSC_ERR_SUP,"This is not supported for complex numbers");
#endif
  return(0);
}
EXTERN_C_END