Actual source code: sbaijfact2.c

petsc-master 2019-08-17
Report Typos and Errors

  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6:  #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7:  #include <../src/mat/impls/baij/seq/baij.h>
  8:  #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
 13:   IS                isrow=a->row;
 14:   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
 15:   PetscErrorCode    ierr;
 16:   const PetscInt    *r;
 17:   PetscInt          nz,*vj,k,idx,k1;
 18:   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
 19:   const MatScalar   *aa=a->a,*v,*diag;
 20:   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
 21:   const PetscScalar *b;

 24:   VecGetArrayRead(bb,&b);
 25:   VecGetArray(xx,&x);
 26:   t    = a->solve_work;
 27:   ISGetIndices(isrow,&r);
 28:   PetscMalloc1(bs,&xk_tmp);

 30:   /* solve U^T * D * y = b by forward substitution */
 31:   xk = t;
 32:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 33:     idx = bs*r[k];
 34:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 35:   }
 36:   for (k=0; k<mbs; k++) {
 37:     v    = aa + bs2*ai[k];
 38:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 39:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
 40:     nz   = ai[k+1] - ai[k];
 41:     vj   = aj + ai[k];
 42:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 43:     while (nz--) {
 44:       /* x(:) += U(k,:)^T*(Dk*xk) */
 45:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 46:       vj++; xj = t + (*vj)*bs;
 47:       v       += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 51:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k=mbs-1; k>=0; k--) {
 56:     v  = aa + bs2*ai[k];
 57:     xk = t + k*bs;        /* xk */
 58:     nz = ai[k+1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj)*bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2; xj = t + (*vj)*bs;
 66:     }
 67:     idx = bs*r[k];
 68:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 69:   }

 71:   PetscFree(xk_tmp);
 72:   ISRestoreIndices(isrow,&r);
 73:   VecRestoreArrayRead(bb,&b);
 74:   VecRestoreArray(xx,&x);
 75:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 76:   return(0);
 77: }

 79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 80: {
 82:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 83:   return(0);
 84: }

 86: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 87: {
 89:   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
 90:   return(0);
 91: }

 93: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
 94: {
 95:   PetscErrorCode  ierr;
 96:   PetscInt        nz,k;
 97:   const PetscInt  *vj,bs2 = bs*bs;
 98:   const MatScalar *v,*diag;
 99:   PetscScalar     *xk,*xj,*xk_tmp;

102:   PetscMalloc1(bs,&xk_tmp);
103:   for (k=0; k<mbs; k++) {
104:     v    = aa + bs2*ai[k];
105:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
106:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
107:     nz   = ai[k+1] - ai[k];
108:     vj   = aj + ai[k];
109:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
110:     while (nz--) {
111:       /* x(:) += U(k,:)^T*(Dk*xk) */
112:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
113:       vj++; xj = x + (*vj)*bs;
114:       v       += bs2;
115:     }
116:     /* xk = inv(Dk)*(Dk*xk) */
117:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
118:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
119:   }
120:   PetscFree(xk_tmp);
121:   return(0);
122: }

124: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
125: {
126:   PetscInt        nz,k;
127:   const PetscInt  *vj,bs2 = bs*bs;
128:   const MatScalar *v;
129:   PetscScalar     *xk,*xj;

132:   for (k=mbs-1; k>=0; k--) {
133:     v  = aa + bs2*ai[k];
134:     xk = x + k*bs;        /* xk */
135:     nz = ai[k+1] - ai[k];
136:     vj = aj + ai[k];
137:     xj = x + (*vj)*bs;
138:     while (nz--) {
139:       /* xk += U(k,:)*x(:) */
140:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
141:       vj++;
142:       v += bs2; xj = x + (*vj)*bs;
143:     }
144:   }
145:   return(0);
146: }

148: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
149: {
150:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
151:   PetscErrorCode    ierr;
152:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
153:   PetscInt          bs =A->rmap->bs;
154:   const MatScalar   *aa=a->a;
155:   PetscScalar       *x;
156:   const PetscScalar *b;

159:   VecGetArrayRead(bb,&b);
160:   VecGetArray(xx,&x);

162:   /* solve U^T * D * y = b by forward substitution */
163:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
164:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

166:   /* solve U*x = y by back substitution */
167:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

169:   VecRestoreArrayRead(bb,&b);
170:   VecRestoreArray(xx,&x);
171:   PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
172:   return(0);
173: }

175: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
176: {
177:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
178:   PetscErrorCode    ierr;
179:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
180:   PetscInt          bs =A->rmap->bs;
181:   const MatScalar   *aa=a->a;
182:   const PetscScalar *b;
183:   PetscScalar       *x;

186:   VecGetArrayRead(bb,&b);
187:   VecGetArray(xx,&x);
188:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
189:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
190:   VecRestoreArrayRead(bb,&b);
191:   VecRestoreArray(xx,&x);
192:   PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
193:   return(0);
194: }

196: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
197: {
198:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
199:   PetscErrorCode    ierr;
200:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
201:   PetscInt          bs =A->rmap->bs;
202:   const MatScalar   *aa=a->a;
203:   const PetscScalar *b;
204:   PetscScalar       *x;

207:   VecGetArrayRead(bb,&b);
208:   VecGetArray(xx,&x);
209:   PetscArraycpy(x,b,bs*mbs);
210:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
211:   VecRestoreArrayRead(bb,&b);
212:   VecRestoreArray(xx,&x);
213:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
214:   return(0);
215: }

217: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
218: {
219:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
220:   IS                isrow=a->row;
221:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
222:   PetscErrorCode    ierr;
223:   PetscInt          nz,k,idx;
224:   const MatScalar   *aa=a->a,*v,*d;
225:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
226:   const PetscScalar *b;

229:   VecGetArrayRead(bb,&b);
230:   VecGetArray(xx,&x);
231:   t    = a->solve_work;
232:   ISGetIndices(isrow,&r);

234:   /* solve U^T * D * y = b by forward substitution */
235:   tp = t;
236:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
237:     idx   = 7*r[k];
238:     tp[0] = b[idx];
239:     tp[1] = b[idx+1];
240:     tp[2] = b[idx+2];
241:     tp[3] = b[idx+3];
242:     tp[4] = b[idx+4];
243:     tp[5] = b[idx+5];
244:     tp[6] = b[idx+6];
245:     tp   += 7;
246:   }

248:   for (k=0; k<mbs; k++) {
249:     v  = aa + 49*ai[k];
250:     vj = aj + ai[k];
251:     tp = t + k*7;
252:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
253:     nz = ai[k+1] - ai[k];
254:     tp = t + (*vj)*7;
255:     while (nz--) {
256:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
257:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
258:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
259:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
260:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
261:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
262:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
263:       vj++;
264:       tp = t + (*vj)*7;
265:       v += 49;
266:     }

268:     /* xk = inv(Dk)*(Dk*xk) */
269:     d     = aa+k*49;       /* ptr to inv(Dk) */
270:     tp    = t + k*7;
271:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
272:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
273:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
274:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
275:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
276:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
277:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
278:   }

280:   /* solve U*x = y by back substitution */
281:   for (k=mbs-1; k>=0; k--) {
282:     v  = aa + 49*ai[k];
283:     vj = aj + ai[k];
284:     tp = t + k*7;
285:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
286:     nz = ai[k+1] - ai[k];

288:     tp = t + (*vj)*7;
289:     while (nz--) {
290:       /* xk += U(k,:)*x(:) */
291:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
292:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
293:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
294:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
295:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
296:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
297:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
298:       vj++;
299:       tp = t + (*vj)*7;
300:       v += 49;
301:     }
302:     tp       = t + k*7;
303:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
304:     idx      = 7*r[k];
305:     x[idx]   = x0;
306:     x[idx+1] = x1;
307:     x[idx+2] = x2;
308:     x[idx+3] = x3;
309:     x[idx+4] = x4;
310:     x[idx+5] = x5;
311:     x[idx+6] = x6;
312:   }

314:   ISRestoreIndices(isrow,&r);
315:   VecRestoreArrayRead(bb,&b);
316:   VecRestoreArray(xx,&x);
317:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
318:   return(0);
319: }

321: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
322: {
323:   const MatScalar *v,*d;
324:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
325:   PetscInt        nz,k;
326:   const PetscInt  *vj;

329:   for (k=0; k<mbs; k++) {
330:     v  = aa + 49*ai[k];
331:     xp = x + k*7;
332:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
333:     nz = ai[k+1] - ai[k];
334:     vj = aj + ai[k];
335:     xp = x + (*vj)*7;
336:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
337:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
338:     while (nz--) {
339:       /* x(:) += U(k,:)^T*(Dk*xk) */
340:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
341:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
342:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
343:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
344:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
345:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
346:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
347:       vj++;
348:       xp = x + (*vj)*7;
349:       v += 49;
350:     }
351:     /* xk = inv(Dk)*(Dk*xk) */
352:     d     = aa+k*49;       /* ptr to inv(Dk) */
353:     xp    = x + k*7;
354:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
355:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
356:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
357:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
358:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
359:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
360:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
361:   }
362:   return(0);
363: }

365: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
366: {
367:   const MatScalar *v;
368:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
369:   PetscInt        nz,k;
370:   const PetscInt  *vj;

373:   for (k=mbs-1; k>=0; k--) {
374:     v  = aa + 49*ai[k];
375:     xp = x + k*7;
376:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
377:     nz = ai[k+1] - ai[k];
378:     vj = aj + ai[k];
379:     xp = x + (*vj)*7;
380:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
381:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
382:     while (nz--) {
383:       /* xk += U(k,:)*x(:) */
384:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
385:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
386:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
387:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
388:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
389:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
390:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
391:       vj++;
392:       v += 49; xp = x + (*vj)*7;
393:     }
394:     xp   = x + k*7;
395:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
396:   }
397:   return(0);
398: }

400: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
401: {
402:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
403:   PetscErrorCode    ierr;
404:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
405:   const MatScalar   *aa=a->a;
406:   PetscScalar       *x;
407:   const PetscScalar *b;

410:   VecGetArrayRead(bb,&b);
411:   VecGetArray(xx,&x);

413:   /* solve U^T * D * y = b by forward substitution */
414:   PetscArraycpy(x,b,7*mbs); /* x <- b */
415:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

417:   /* solve U*x = y by back substitution */
418:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

420:   VecRestoreArrayRead(bb,&b);
421:   VecRestoreArray(xx,&x);
422:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
423:   return(0);
424: }

426: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
427: {
428:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
429:   PetscErrorCode    ierr;
430:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
431:   const MatScalar   *aa=a->a;
432:   PetscScalar       *x;
433:   const PetscScalar *b;

436:   VecGetArrayRead(bb,&b);
437:   VecGetArray(xx,&x);
438:   PetscArraycpy(x,b,7*mbs);
439:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
440:   VecRestoreArrayRead(bb,&b);
441:   VecRestoreArray(xx,&x);
442:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
443:   return(0);
444: }

446: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
447: {
448:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
449:   PetscErrorCode    ierr;
450:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
451:   const MatScalar   *aa=a->a;
452:   PetscScalar       *x;
453:   const PetscScalar *b;

456:   VecGetArrayRead(bb,&b);
457:   VecGetArray(xx,&x);
458:   PetscArraycpy(x,b,7*mbs);
459:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
460:   VecRestoreArrayRead(bb,&b);
461:   VecRestoreArray(xx,&x);
462:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
463:   return(0);
464: }

466: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
467: {
468:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
469:   IS                isrow=a->row;
470:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
471:   PetscErrorCode    ierr;
472:   PetscInt          nz,k,idx;
473:   const MatScalar   *aa=a->a,*v,*d;
474:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
475:   const PetscScalar *b;

478:   VecGetArrayRead(bb,&b);
479:   VecGetArray(xx,&x);
480:   t    = a->solve_work;
481:   ISGetIndices(isrow,&r);

483:   /* solve U^T * D * y = b by forward substitution */
484:   tp = t;
485:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
486:     idx   = 6*r[k];
487:     tp[0] = b[idx];
488:     tp[1] = b[idx+1];
489:     tp[2] = b[idx+2];
490:     tp[3] = b[idx+3];
491:     tp[4] = b[idx+4];
492:     tp[5] = b[idx+5];
493:     tp   += 6;
494:   }

496:   for (k=0; k<mbs; k++) {
497:     v  = aa + 36*ai[k];
498:     vj = aj + ai[k];
499:     tp = t + k*6;
500:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
501:     nz = ai[k+1] - ai[k];
502:     tp = t + (*vj)*6;
503:     while (nz--) {
504:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
505:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
506:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
507:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
508:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
509:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
510:       vj++;
511:       tp = t + (*vj)*6;
512:       v += 36;
513:     }

515:     /* xk = inv(Dk)*(Dk*xk) */
516:     d     = aa+k*36;       /* ptr to inv(Dk) */
517:     tp    = t + k*6;
518:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
519:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
520:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
521:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
522:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
523:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
524:   }

526:   /* solve U*x = y by back substitution */
527:   for (k=mbs-1; k>=0; k--) {
528:     v  = aa + 36*ai[k];
529:     vj = aj + ai[k];
530:     tp = t + k*6;
531:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
532:     nz = ai[k+1] - ai[k];

534:     tp = t + (*vj)*6;
535:     while (nz--) {
536:       /* xk += U(k,:)*x(:) */
537:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
538:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
539:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
540:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
541:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
542:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
543:       vj++;
544:       tp = t + (*vj)*6;
545:       v += 36;
546:     }
547:     tp       = t + k*6;
548:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
549:     idx      = 6*r[k];
550:     x[idx]   = x0;
551:     x[idx+1] = x1;
552:     x[idx+2] = x2;
553:     x[idx+3] = x3;
554:     x[idx+4] = x4;
555:     x[idx+5] = x5;
556:   }

558:   ISRestoreIndices(isrow,&r);
559:   VecRestoreArrayRead(bb,&b);
560:   VecRestoreArray(xx,&x);
561:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
562:   return(0);
563: }

565: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
566: {
567:   const MatScalar *v,*d;
568:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
569:   PetscInt        nz,k;
570:   const PetscInt  *vj;

573:   for (k=0; k<mbs; k++) {
574:     v  = aa + 36*ai[k];
575:     xp = x + k*6;
576:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
577:     nz = ai[k+1] - ai[k];
578:     vj = aj + ai[k];
579:     xp = x + (*vj)*6;
580:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
581:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
582:     while (nz--) {
583:       /* x(:) += U(k,:)^T*(Dk*xk) */
584:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
585:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
586:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
587:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
588:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
589:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
590:       vj++;
591:       xp = x + (*vj)*6;
592:       v += 36;
593:     }
594:     /* xk = inv(Dk)*(Dk*xk) */
595:     d     = aa+k*36;       /* ptr to inv(Dk) */
596:     xp    = x + k*6;
597:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
598:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
599:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
600:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
601:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
602:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
603:   }
604:   return(0);
605: }
606: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
607: {
608:   const MatScalar   *v;
609:   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
610:   PetscInt          nz,k;
611:   const PetscInt    *vj;

614:   for (k=mbs-1; k>=0; k--) {
615:     v  = aa + 36*ai[k];
616:     xp = x + k*6;
617:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
618:     nz = ai[k+1] - ai[k];
619:     vj = aj + ai[k];
620:     xp = x + (*vj)*6;
621:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
622:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
623:     while (nz--) {
624:       /* xk += U(k,:)*x(:) */
625:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
626:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
627:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
628:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
629:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
630:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
631:       vj++;
632:       v += 36; xp = x + (*vj)*6;
633:     }
634:     xp   = x + k*6;
635:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
636:   }
637:   return(0);
638: }


641: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
642: {
643:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
644:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
645:   const MatScalar   *aa=a->a;
646:   PetscScalar       *x;
647:   const PetscScalar *b;
648:   PetscErrorCode    ierr;

651:   VecGetArrayRead(bb,&b);
652:   VecGetArray(xx,&x);

654:   /* solve U^T * D * y = b by forward substitution */
655:   PetscArraycpy(x,b,6*mbs); /* x <- b */
656:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

658:   /* solve U*x = y by back substitution */
659:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

661:   VecRestoreArrayRead(bb,&b);
662:   VecRestoreArray(xx,&x);
663:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
664:   return(0);
665: }

667: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
668: {
669:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
670:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
671:   const MatScalar   *aa=a->a;
672:   PetscScalar       *x;
673:   const PetscScalar *b;
674:   PetscErrorCode    ierr;

677:   VecGetArrayRead(bb,&b);
678:   VecGetArray(xx,&x);
679:   PetscArraycpy(x,b,6*mbs); /* x <- b */
680:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
681:   VecRestoreArrayRead(bb,&b);
682:   VecRestoreArray(xx,&x);
683:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
684:   return(0);
685: }

687: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
688: {
689:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
690:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
691:   const MatScalar   *aa=a->a;
692:   PetscScalar       *x;
693:   const PetscScalar *b;
694:   PetscErrorCode    ierr;

697:   VecGetArrayRead(bb,&b);
698:   VecGetArray(xx,&x);
699:   PetscArraycpy(x,b,6*mbs); /* x <- b */
700:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
701:   VecRestoreArrayRead(bb,&b);
702:   VecRestoreArray(xx,&x);
703:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
704:   return(0);
705: }

707: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
708: {
709:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
710:   IS                isrow=a->row;
711:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
712:   PetscErrorCode    ierr;
713:   const PetscInt    *r,*vj;
714:   PetscInt          nz,k,idx;
715:   const MatScalar   *aa=a->a,*v,*diag;
716:   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
717:   const PetscScalar *b;

720:   VecGetArrayRead(bb,&b);
721:   VecGetArray(xx,&x);
722:   t    = a->solve_work;
723:   ISGetIndices(isrow,&r);

725:   /* solve U^T * D * y = b by forward substitution */
726:   tp = t;
727:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
728:     idx   = 5*r[k];
729:     tp[0] = b[idx];
730:     tp[1] = b[idx+1];
731:     tp[2] = b[idx+2];
732:     tp[3] = b[idx+3];
733:     tp[4] = b[idx+4];
734:     tp   += 5;
735:   }

737:   for (k=0; k<mbs; k++) {
738:     v  = aa + 25*ai[k];
739:     vj = aj + ai[k];
740:     tp = t + k*5;
741:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
742:     nz = ai[k+1] - ai[k];

744:     tp = t + (*vj)*5;
745:     while (nz--) {
746:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
747:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
748:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
749:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
750:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
751:       vj++;
752:       tp = t + (*vj)*5;
753:       v += 25;
754:     }

756:     /* xk = inv(Dk)*(Dk*xk) */
757:     diag  = aa+k*25;          /* ptr to inv(Dk) */
758:     tp    = t + k*5;
759:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
760:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
761:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
762:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
763:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
764:   }

766:   /* solve U*x = y by back substitution */
767:   for (k=mbs-1; k>=0; k--) {
768:     v  = aa + 25*ai[k];
769:     vj = aj + ai[k];
770:     tp = t + k*5;
771:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
772:     nz = ai[k+1] - ai[k];

774:     tp = t + (*vj)*5;
775:     while (nz--) {
776:       /* xk += U(k,:)*x(:) */
777:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
778:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
779:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
780:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
781:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
782:       vj++;
783:       tp = t + (*vj)*5;
784:       v += 25;
785:     }
786:     tp       = t + k*5;
787:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
788:     idx      = 5*r[k];
789:     x[idx]   = x0;
790:     x[idx+1] = x1;
791:     x[idx+2] = x2;
792:     x[idx+3] = x3;
793:     x[idx+4] = x4;
794:   }

796:   ISRestoreIndices(isrow,&r);
797:   VecRestoreArrayRead(bb,&b);
798:   VecRestoreArray(xx,&x);
799:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
800:   return(0);
801: }

803: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
804: {
805:   const MatScalar *v,*diag;
806:   PetscScalar     *xp,x0,x1,x2,x3,x4;
807:   PetscInt        nz,k;
808:   const PetscInt  *vj;

811:   for (k=0; k<mbs; k++) {
812:     v  = aa + 25*ai[k];
813:     xp = x + k*5;
814:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
815:     nz = ai[k+1] - ai[k];
816:     vj = aj + ai[k];
817:     xp = x + (*vj)*5;
818:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
819:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
820:     while (nz--) {
821:       /* x(:) += U(k,:)^T*(Dk*xk) */
822:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
823:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
824:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
825:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
826:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
827:       vj++;
828:       xp = x + (*vj)*5;
829:       v += 25;
830:     }
831:     /* xk = inv(Dk)*(Dk*xk) */
832:     diag  = aa+k*25;         /* ptr to inv(Dk) */
833:     xp    = x + k*5;
834:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
835:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
836:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
837:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
838:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
839:   }
840:   return(0);
841: }

843: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
844: {
845:   const MatScalar *v;
846:   PetscScalar     *xp,x0,x1,x2,x3,x4;
847:   PetscInt        nz,k;
848:   const PetscInt  *vj;

851:   for (k=mbs-1; k>=0; k--) {
852:     v  = aa + 25*ai[k];
853:     xp = x + k*5;
854:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
855:     nz = ai[k+1] - ai[k];
856:     vj = aj + ai[k];
857:     xp = x + (*vj)*5;
858:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
859:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
860:     while (nz--) {
861:       /* xk += U(k,:)*x(:) */
862:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
863:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
864:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
865:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
866:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
867:       vj++;
868:       v += 25; xp = x + (*vj)*5;
869:     }
870:     xp   = x + k*5;
871:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
872:   }
873:   return(0);
874: }

876: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
877: {
878:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
879:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
880:   const MatScalar   *aa=a->a;
881:   PetscScalar       *x;
882:   const PetscScalar *b;
883:   PetscErrorCode    ierr;

886:   VecGetArrayRead(bb,&b);
887:   VecGetArray(xx,&x);

889:   /* solve U^T * D * y = b by forward substitution */
890:   PetscArraycpy(x,b,5*mbs); /* x <- b */
891:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

893:   /* solve U*x = y by back substitution */
894:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

896:   VecRestoreArrayRead(bb,&b);
897:   VecRestoreArray(xx,&x);
898:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
899:   return(0);
900: }

902: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
903: {
904:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
905:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
906:   const MatScalar   *aa=a->a;
907:   PetscScalar       *x;
908:   const PetscScalar *b;
909:   PetscErrorCode    ierr;

912:   VecGetArrayRead(bb,&b);
913:   VecGetArray(xx,&x);
914:   PetscArraycpy(x,b,5*mbs); /* x <- b */
915:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
916:   VecRestoreArrayRead(bb,&b);
917:   VecRestoreArray(xx,&x);
918:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
919:   return(0);
920: }

922: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
923: {
924:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
925:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
926:   const MatScalar   *aa=a->a;
927:   PetscScalar       *x;
928:   const PetscScalar *b;
929:   PetscErrorCode    ierr;

932:   VecGetArrayRead(bb,&b);
933:   VecGetArray(xx,&x);
934:   PetscArraycpy(x,b,5*mbs);
935:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
936:   VecRestoreArrayRead(bb,&b);
937:   VecRestoreArray(xx,&x);
938:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
939:   return(0);
940: }

942: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
943: {
944:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
945:   IS                isrow=a->row;
946:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
947:   PetscErrorCode    ierr;
948:   const PetscInt    *r,*vj;
949:   PetscInt          nz,k,idx;
950:   const MatScalar   *aa=a->a,*v,*diag;
951:   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
952:   const PetscScalar *b;

955:   VecGetArrayRead(bb,&b);
956:   VecGetArray(xx,&x);
957:   t    = a->solve_work;
958:   ISGetIndices(isrow,&r);

960:   /* solve U^T * D * y = b by forward substitution */
961:   tp = t;
962:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
963:     idx   = 4*r[k];
964:     tp[0] = b[idx];
965:     tp[1] = b[idx+1];
966:     tp[2] = b[idx+2];
967:     tp[3] = b[idx+3];
968:     tp   += 4;
969:   }

971:   for (k=0; k<mbs; k++) {
972:     v  = aa + 16*ai[k];
973:     vj = aj + ai[k];
974:     tp = t + k*4;
975:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
976:     nz = ai[k+1] - ai[k];

978:     tp = t + (*vj)*4;
979:     while (nz--) {
980:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
981:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
982:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
983:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
984:       vj++;
985:       tp = t + (*vj)*4;
986:       v += 16;
987:     }

989:     /* xk = inv(Dk)*(Dk*xk) */
990:     diag  = aa+k*16;          /* ptr to inv(Dk) */
991:     tp    = t + k*4;
992:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
993:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
994:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
995:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
996:   }

998:   /* solve U*x = y by back substitution */
999:   for (k=mbs-1; k>=0; k--) {
1000:     v  = aa + 16*ai[k];
1001:     vj = aj + ai[k];
1002:     tp = t + k*4;
1003:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1004:     nz = ai[k+1] - ai[k];

1006:     tp = t + (*vj)*4;
1007:     while (nz--) {
1008:       /* xk += U(k,:)*x(:) */
1009:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1010:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1011:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1012:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1013:       vj++;
1014:       tp = t + (*vj)*4;
1015:       v += 16;
1016:     }
1017:     tp       = t + k*4;
1018:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1019:     idx      = 4*r[k];
1020:     x[idx]   = x0;
1021:     x[idx+1] = x1;
1022:     x[idx+2] = x2;
1023:     x[idx+3] = x3;
1024:   }

1026:   ISRestoreIndices(isrow,&r);
1027:   VecRestoreArrayRead(bb,&b);
1028:   VecRestoreArray(xx,&x);
1029:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1030:   return(0);
1031: }

1033: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1034: {
1035:   const MatScalar *v,*diag;
1036:   PetscScalar     *xp,x0,x1,x2,x3;
1037:   PetscInt        nz,k;
1038:   const PetscInt  *vj;

1041:   for (k=0; k<mbs; k++) {
1042:     v  = aa + 16*ai[k];
1043:     xp = x + k*4;
1044:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1045:     nz = ai[k+1] - ai[k];
1046:     vj = aj + ai[k];
1047:     xp = x + (*vj)*4;
1048:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1049:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050:     while (nz--) {
1051:       /* x(:) += U(k,:)^T*(Dk*xk) */
1052:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1053:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1054:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1055:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1056:       vj++;
1057:       xp = x + (*vj)*4;
1058:       v += 16;
1059:     }
1060:     /* xk = inv(Dk)*(Dk*xk) */
1061:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1062:     xp    = x + k*4;
1063:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1064:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1065:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1066:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1067:   }
1068:   return(0);
1069: }

1071: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1072: {
1073:   const MatScalar *v;
1074:   PetscScalar     *xp,x0,x1,x2,x3;
1075:   PetscInt        nz,k;
1076:   const PetscInt  *vj;

1079:   for (k=mbs-1; k>=0; k--) {
1080:     v  = aa + 16*ai[k];
1081:     xp = x + k*4;
1082:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1083:     nz = ai[k+1] - ai[k];
1084:     vj = aj + ai[k];
1085:     xp = x + (*vj)*4;
1086:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1087:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1088:     while (nz--) {
1089:       /* xk += U(k,:)*x(:) */
1090:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1091:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1092:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1093:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1094:       vj++;
1095:       v += 16; xp = x + (*vj)*4;
1096:     }
1097:     xp    = x + k*4;
1098:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1099:   }
1100:   return(0);
1101: }

1103: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1104: {
1105:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1106:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1107:   const MatScalar   *aa=a->a;
1108:   PetscScalar       *x;
1109:   const PetscScalar *b;
1110:   PetscErrorCode    ierr;

1113:   VecGetArrayRead(bb,&b);
1114:   VecGetArray(xx,&x);

1116:   /* solve U^T * D * y = b by forward substitution */
1117:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1118:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1120:   /* solve U*x = y by back substitution */
1121:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1122:   VecRestoreArrayRead(bb,&b);
1123:   VecRestoreArray(xx,&x);
1124:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1125:   return(0);
1126: }

1128: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1129: {
1130:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1131:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1132:   const MatScalar   *aa=a->a;
1133:   PetscScalar       *x;
1134:   const PetscScalar *b;
1135:   PetscErrorCode    ierr;

1138:   VecGetArrayRead(bb,&b);
1139:   VecGetArray(xx,&x);
1140:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1141:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1142:   VecRestoreArrayRead(bb,&b);
1143:   VecRestoreArray(xx,&x);
1144:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1145:   return(0);
1146: }

1148: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1149: {
1150:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1151:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1152:   const MatScalar   *aa=a->a;
1153:   PetscScalar       *x;
1154:   const PetscScalar *b;
1155:   PetscErrorCode    ierr;

1158:   VecGetArrayRead(bb,&b);
1159:   VecGetArray(xx,&x);
1160:   PetscArraycpy(x,b,4*mbs);
1161:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1162:   VecRestoreArrayRead(bb,&b);
1163:   VecRestoreArray(xx,&x);
1164:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1165:   return(0);
1166: }

1168: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1169: {
1170:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1171:   IS                isrow=a->row;
1172:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1173:   PetscErrorCode    ierr;
1174:   const PetscInt    *r;
1175:   PetscInt          nz,k,idx;
1176:   const PetscInt    *vj;
1177:   const MatScalar   *aa=a->a,*v,*diag;
1178:   PetscScalar       *x,x0,x1,x2,*t,*tp;
1179:   const PetscScalar *b;

1182:   VecGetArrayRead(bb,&b);
1183:   VecGetArray(xx,&x);
1184:   t    = a->solve_work;
1185:   ISGetIndices(isrow,&r);

1187:   /* solve U^T * D * y = b by forward substitution */
1188:   tp = t;
1189:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1190:     idx   = 3*r[k];
1191:     tp[0] = b[idx];
1192:     tp[1] = b[idx+1];
1193:     tp[2] = b[idx+2];
1194:     tp   += 3;
1195:   }

1197:   for (k=0; k<mbs; k++) {
1198:     v  = aa + 9*ai[k];
1199:     vj = aj + ai[k];
1200:     tp = t + k*3;
1201:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1202:     nz = ai[k+1] - ai[k];

1204:     tp = t + (*vj)*3;
1205:     while (nz--) {
1206:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1207:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1208:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1209:       vj++;
1210:       tp = t + (*vj)*3;
1211:       v += 9;
1212:     }

1214:     /* xk = inv(Dk)*(Dk*xk) */
1215:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1216:     tp    = t + k*3;
1217:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1218:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1219:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1220:   }

1222:   /* solve U*x = y by back substitution */
1223:   for (k=mbs-1; k>=0; k--) {
1224:     v  = aa + 9*ai[k];
1225:     vj = aj + ai[k];
1226:     tp = t + k*3;
1227:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1228:     nz = ai[k+1] - ai[k];

1230:     tp = t + (*vj)*3;
1231:     while (nz--) {
1232:       /* xk += U(k,:)*x(:) */
1233:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1234:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1235:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1236:       vj++;
1237:       tp = t + (*vj)*3;
1238:       v += 9;
1239:     }
1240:     tp       = t + k*3;
1241:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1242:     idx      = 3*r[k];
1243:     x[idx]   = x0;
1244:     x[idx+1] = x1;
1245:     x[idx+2] = x2;
1246:   }

1248:   ISRestoreIndices(isrow,&r);
1249:   VecRestoreArrayRead(bb,&b);
1250:   VecRestoreArray(xx,&x);
1251:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1252:   return(0);
1253: }

1255: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1256: {
1257:   const MatScalar *v,*diag;
1258:   PetscScalar     *xp,x0,x1,x2;
1259:   PetscInt        nz,k;
1260:   const PetscInt  *vj;

1263:   for (k=0; k<mbs; k++) {
1264:     v  = aa + 9*ai[k];
1265:     xp = x + k*3;
1266:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1267:     nz = ai[k+1] - ai[k];
1268:     vj = aj + ai[k];
1269:     xp = x + (*vj)*3;
1270:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1271:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1272:     while (nz--) {
1273:       /* x(:) += U(k,:)^T*(Dk*xk) */
1274:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1275:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1276:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1277:       vj++;
1278:       xp = x + (*vj)*3;
1279:       v += 9;
1280:     }
1281:     /* xk = inv(Dk)*(Dk*xk) */
1282:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1283:     xp    = x + k*3;
1284:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1285:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1286:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1287:   }
1288:   return(0);
1289: }

1291: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1292: {
1293:   const MatScalar *v;
1294:   PetscScalar     *xp,x0,x1,x2;
1295:   PetscInt        nz,k;
1296:   const PetscInt  *vj;

1299:   for (k=mbs-1; k>=0; k--) {
1300:     v  = aa + 9*ai[k];
1301:     xp = x + k*3;
1302:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1303:     nz = ai[k+1] - ai[k];
1304:     vj = aj + ai[k];
1305:     xp = x + (*vj)*3;
1306:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1307:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1308:     while (nz--) {
1309:       /* xk += U(k,:)*x(:) */
1310:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1311:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1312:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1313:       vj++;
1314:       v += 9; xp = x + (*vj)*3;
1315:     }
1316:     xp    = x + k*3;
1317:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1318:   }
1319:   return(0);
1320: }

1322: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1323: {
1324:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1325:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1326:   const MatScalar   *aa=a->a;
1327:   PetscScalar       *x;
1328:   const PetscScalar *b;
1329:   PetscErrorCode    ierr;

1332:   VecGetArrayRead(bb,&b);
1333:   VecGetArray(xx,&x);

1335:   /* solve U^T * D * y = b by forward substitution */
1336:   PetscArraycpy(x,b,3*mbs);
1337:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1339:   /* solve U*x = y by back substitution */
1340:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1342:   VecRestoreArrayRead(bb,&b);
1343:   VecRestoreArray(xx,&x);
1344:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1345:   return(0);
1346: }

1348: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1349: {
1350:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1351:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1352:   const MatScalar   *aa=a->a;
1353:   PetscScalar       *x;
1354:   const PetscScalar *b;
1355:   PetscErrorCode    ierr;

1358:   VecGetArrayRead(bb,&b);
1359:   VecGetArray(xx,&x);
1360:   PetscArraycpy(x,b,3*mbs);
1361:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1362:   VecRestoreArrayRead(bb,&b);
1363:   VecRestoreArray(xx,&x);
1364:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1365:   return(0);
1366: }

1368: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1369: {
1370:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1371:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1372:   const MatScalar   *aa=a->a;
1373:   PetscScalar       *x;
1374:   const PetscScalar *b;
1375:   PetscErrorCode    ierr;

1378:   VecGetArrayRead(bb,&b);
1379:   VecGetArray(xx,&x);
1380:   PetscArraycpy(x,b,3*mbs);
1381:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1382:   VecRestoreArrayRead(bb,&b);
1383:   VecRestoreArray(xx,&x);
1384:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1385:   return(0);
1386: }

1388: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1389: {
1390:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1391:   IS                isrow=a->row;
1392:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1393:   PetscErrorCode    ierr;
1394:   const PetscInt    *r,*vj;
1395:   PetscInt          nz,k,k2,idx;
1396:   const MatScalar   *aa=a->a,*v,*diag;
1397:   PetscScalar       *x,x0,x1,*t;
1398:   const PetscScalar *b;

1401:   VecGetArrayRead(bb,&b);
1402:   VecGetArray(xx,&x);
1403:   t    = a->solve_work;
1404:   ISGetIndices(isrow,&r);

1406:   /* solve U^T * D * y = perm(b) by forward substitution */
1407:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1408:     idx      = 2*r[k];
1409:     t[k*2]   = b[idx];
1410:     t[k*2+1] = b[idx+1];
1411:   }
1412:   for (k=0; k<mbs; k++) {
1413:     v  = aa + 4*ai[k];
1414:     vj = aj + ai[k];
1415:     k2 = k*2;
1416:     x0 = t[k2]; x1 = t[k2+1];
1417:     nz = ai[k+1] - ai[k];
1418:     while (nz--) {
1419:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1420:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1421:       vj++; v      += 4;
1422:     }
1423:     diag    = aa+k*4; /* ptr to inv(Dk) */
1424:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1425:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1426:   }

1428:   /* solve U*x = y by back substitution */
1429:   for (k=mbs-1; k>=0; k--) {
1430:     v  = aa + 4*ai[k];
1431:     vj = aj + ai[k];
1432:     k2 = k*2;
1433:     x0 = t[k2]; x1 = t[k2+1];
1434:     nz = ai[k+1] - ai[k];
1435:     while (nz--) {
1436:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1437:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1438:       vj++;
1439:       v += 4;
1440:     }
1441:     t[k2]    = x0;
1442:     t[k2+1]  = x1;
1443:     idx      = 2*r[k];
1444:     x[idx]   = x0;
1445:     x[idx+1] = x1;
1446:   }

1448:   ISRestoreIndices(isrow,&r);
1449:   VecRestoreArrayRead(bb,&b);
1450:   VecRestoreArray(xx,&x);
1451:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1452:   return(0);
1453: }

1455: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1456: {
1457:   const MatScalar *v,*diag;
1458:   PetscScalar     x0,x1;
1459:   PetscInt        nz,k,k2;
1460:   const PetscInt  *vj;
1461: 
1463:   for (k=0; k<mbs; k++) {
1464:     v  = aa + 4*ai[k];
1465:     vj = aj + ai[k];
1466:     k2 = k*2;
1467:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1468:     nz = ai[k+1] - ai[k];
1469:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1470:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1471:     while (nz--) {
1472:       /* x(:) += U(k,:)^T*(Dk*xk) */
1473:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1474:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1475:       vj++; v      += 4;
1476:     }
1477:     /* xk = inv(Dk)*(Dk*xk) */
1478:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1479:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1480:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1481:   }
1482:   return(0);
1483: }

1485: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1486: {
1487:   const MatScalar *v;
1488:   PetscScalar     x0,x1;
1489:   PetscInt        nz,k,k2;
1490:   const PetscInt  *vj;

1493:   for (k=mbs-1; k>=0; k--) {
1494:     v  = aa + 4*ai[k];
1495:     vj = aj + ai[k];
1496:     k2 = k*2;
1497:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1498:     nz = ai[k+1] - ai[k];
1499:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1500:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1501:     while (nz--) {
1502:       /* xk += U(k,:)*x(:) */
1503:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1504:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1505:       vj++;
1506:       v += 4;
1507:     }
1508:     x[k2]   = x0;
1509:     x[k2+1] = x1;
1510:   }
1511:   return(0);
1512: }

1514: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1515: {
1516:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1517:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1518:   const MatScalar   *aa=a->a;
1519:   PetscScalar       *x;
1520:   const PetscScalar *b;
1521:   PetscErrorCode    ierr;

1524:   VecGetArrayRead(bb,&b);
1525:   VecGetArray(xx,&x);

1527:   /* solve U^T * D * y = b by forward substitution */
1528:   PetscArraycpy(x,b,2*mbs);
1529:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1531:   /* solve U*x = y by back substitution */
1532:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1534:   VecRestoreArrayRead(bb,&b);
1535:   VecRestoreArray(xx,&x);
1536:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1537:   return(0);
1538: }

1540: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1541: {
1542:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1543:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1544:   const MatScalar   *aa=a->a;
1545:   PetscScalar       *x;
1546:   const PetscScalar *b;
1547:   PetscErrorCode    ierr;

1550:   VecGetArrayRead(bb,&b);
1551:   VecGetArray(xx,&x);
1552:   PetscArraycpy(x,b,2*mbs);
1553:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1554:   VecRestoreArrayRead(bb,&b);
1555:   VecRestoreArray(xx,&x);
1556:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1557:   return(0);
1558: }

1560: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1561: {
1562:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1563:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1564:   const MatScalar   *aa=a->a;
1565:   PetscScalar       *x;
1566:   const PetscScalar *b;
1567:   PetscErrorCode    ierr;

1570:   VecGetArrayRead(bb,&b);
1571:   VecGetArray(xx,&x);
1572:   PetscArraycpy(x,b,2*mbs);
1573:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1574:   VecRestoreArrayRead(bb,&b);
1575:   VecRestoreArray(xx,&x);
1576:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1577:   return(0);
1578: }

1580: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1581: {
1582:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1583:   IS                isrow=a->row;
1584:   PetscErrorCode    ierr;
1585:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1586:   const MatScalar   *aa=a->a,*v;
1587:   const PetscScalar *b;
1588:   PetscScalar       *x,xk,*t;
1589:   PetscInt          nz,k,j;

1592:   VecGetArrayRead(bb,&b);
1593:   VecGetArray(xx,&x);
1594:   t    = a->solve_work;
1595:   ISGetIndices(isrow,&rp);

1597:   /* solve U^T*D*y = perm(b) by forward substitution */
1598:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1599:   for (k=0; k<mbs; k++) {
1600:     v  = aa + ai[k];
1601:     vj = aj + ai[k];
1602:     xk = t[k];
1603:     nz = ai[k+1] - ai[k] - 1;
1604:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1605:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1606:   }

1608:   /* solve U*perm(x) = y by back substitution */
1609:   for (k=mbs-1; k>=0; k--) {
1610:     v  = aa + adiag[k] - 1;
1611:     vj = aj + adiag[k] - 1;
1612:     nz = ai[k+1] - ai[k] - 1;
1613:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1614:     x[rp[k]] = t[k];
1615:   }

1617:   ISRestoreIndices(isrow,&rp);
1618:   VecRestoreArrayRead(bb,&b);
1619:   VecRestoreArray(xx,&x);
1620:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1621:   return(0);
1622: }

1624: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1625: {
1626:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1627:   IS                isrow=a->row;
1628:   PetscErrorCode    ierr;
1629:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1630:   const MatScalar   *aa=a->a,*v;
1631:   PetscScalar       *x,xk,*t;
1632:   const PetscScalar *b;
1633:   PetscInt          nz,k;

1636:   VecGetArrayRead(bb,&b);
1637:   VecGetArray(xx,&x);
1638:   t    = a->solve_work;
1639:   ISGetIndices(isrow,&rp);

1641:   /* solve U^T*D*y = perm(b) by forward substitution */
1642:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1643:   for (k=0; k<mbs; k++) {
1644:     v  = aa + ai[k] + 1;
1645:     vj = aj + ai[k] + 1;
1646:     xk = t[k];
1647:     nz = ai[k+1] - ai[k] - 1;
1648:     while (nz--) t[*vj++] += (*v++) * xk;
1649:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1650:   }

1652:   /* solve U*perm(x) = y by back substitution */
1653:   for (k=mbs-1; k>=0; k--) {
1654:     v  = aa + ai[k] + 1;
1655:     vj = aj + ai[k] + 1;
1656:     nz = ai[k+1] - ai[k] - 1;
1657:     while (nz--) t[k] += (*v++) * t[*vj++];
1658:     x[rp[k]] = t[k];
1659:   }

1661:   ISRestoreIndices(isrow,&rp);
1662:   VecRestoreArrayRead(bb,&b);
1663:   VecRestoreArray(xx,&x);
1664:   PetscLogFlops(4.0*a->nz - 3*mbs);
1665:   return(0);
1666: }

1668: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1669: {
1670:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1671:   IS                isrow=a->row;
1672:   PetscErrorCode    ierr;
1673:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1674:   const MatScalar   *aa=a->a,*v;
1675:   PetscReal         diagk;
1676:   PetscScalar       *x,xk;
1677:   const PetscScalar *b;
1678:   PetscInt          nz,k;

1681:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1682:   VecGetArrayRead(bb,&b);
1683:   VecGetArray(xx,&x);
1684:   ISGetIndices(isrow,&rp);

1686:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1687:   for (k=0; k<mbs; k++) {
1688:     v  = aa + ai[k];
1689:     vj = aj + ai[k];
1690:     xk = x[k];
1691:     nz = ai[k+1] - ai[k] - 1;
1692:     while (nz--) x[*vj++] += (*v++) * xk;

1694:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1695:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1696:     x[k] = xk*PetscSqrtReal(diagk);
1697:   }
1698:   ISRestoreIndices(isrow,&rp);
1699:   VecRestoreArrayRead(bb,&b);
1700:   VecRestoreArray(xx,&x);
1701:   PetscLogFlops(2.0*a->nz - mbs);
1702:   return(0);
1703: }

1705: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1706: {
1707:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1708:   IS                isrow=a->row;
1709:   PetscErrorCode    ierr;
1710:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1711:   const MatScalar   *aa=a->a,*v;
1712:   PetscReal         diagk;
1713:   PetscScalar       *x,xk;
1714:   const PetscScalar *b;
1715:   PetscInt          nz,k;

1718:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1719:   VecGetArrayRead(bb,&b);
1720:   VecGetArray(xx,&x);
1721:   ISGetIndices(isrow,&rp);

1723:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1724:   for (k=0; k<mbs; k++) {
1725:     v  = aa + ai[k] + 1;
1726:     vj = aj + ai[k] + 1;
1727:     xk = x[k];
1728:     nz = ai[k+1] - ai[k] - 1;
1729:     while (nz--) x[*vj++] += (*v++) * xk;

1731:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1732:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1733:     x[k] = xk*PetscSqrtReal(diagk);
1734:   }
1735:   ISRestoreIndices(isrow,&rp);
1736:   VecRestoreArrayRead(bb,&b);
1737:   VecRestoreArray(xx,&x);
1738:   PetscLogFlops(2.0*a->nz - mbs);
1739:   return(0);
1740: }

1742: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1743: {
1744:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1745:   IS                isrow=a->row;
1746:   PetscErrorCode    ierr;
1747:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1748:   const MatScalar   *aa=a->a,*v;
1749:   PetscReal         diagk;
1750:   PetscScalar       *x,*t;
1751:   const PetscScalar *b;
1752:   PetscInt          nz,k;

1755:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1756:   VecGetArrayRead(bb,&b);
1757:   VecGetArray(xx,&x);
1758:   t    = a->solve_work;
1759:   ISGetIndices(isrow,&rp);

1761:   for (k=mbs-1; k>=0; k--) {
1762:     v     = aa + ai[k];
1763:     vj    = aj + ai[k];
1764:     diagk = PetscRealPart(aa[adiag[k]]);
1765:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1766:     t[k] = b[k] * PetscSqrtReal(diagk);
1767:     nz   = ai[k+1] - ai[k] - 1;
1768:     while (nz--) t[k] += (*v++) * t[*vj++];
1769:     x[rp[k]] = t[k];
1770:   }
1771:   ISRestoreIndices(isrow,&rp);
1772:   VecRestoreArrayRead(bb,&b);
1773:   VecRestoreArray(xx,&x);
1774:   PetscLogFlops(2.0*a->nz - mbs);
1775:   return(0);
1776: }

1778: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1779: {
1780:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1781:   IS                isrow=a->row;
1782:   PetscErrorCode    ierr;
1783:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1784:   const MatScalar   *aa=a->a,*v;
1785:   PetscReal         diagk;
1786:   PetscScalar       *x,*t;
1787:   const PetscScalar *b;
1788:   PetscInt          nz,k;

1791:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1792:   VecGetArrayRead(bb,&b);
1793:   VecGetArray(xx,&x);
1794:   t    = a->solve_work;
1795:   ISGetIndices(isrow,&rp);

1797:   for (k=mbs-1; k>=0; k--) {
1798:     v     = aa + ai[k] + 1;
1799:     vj    = aj + ai[k] + 1;
1800:     diagk = PetscRealPart(aa[ai[k]]);
1801:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1802:     t[k] = b[k] * PetscSqrtReal(diagk);
1803:     nz   = ai[k+1] - ai[k] - 1;
1804:     while (nz--) t[k] += (*v++) * t[*vj++];
1805:     x[rp[k]] = t[k];
1806:   }
1807:   ISRestoreIndices(isrow,&rp);
1808:   VecRestoreArrayRead(bb,&b);
1809:   VecRestoreArray(xx,&x);
1810:   PetscLogFlops(2.0*a->nz - mbs);
1811:   return(0);
1812: }

1814: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1815: {
1816:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1820:   if (A->rmap->bs == 1) {
1821:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1822:   } else {
1823:     IS                isrow=a->row;
1824:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1825:     const MatScalar   *aa=a->a,*v;
1826:     PetscScalar       *x,*t;
1827:     const PetscScalar *b;
1828:     PetscInt          nz,k,n,i,j;

1830:     if (bb->n > a->solves_work_n) {
1831:       PetscFree(a->solves_work);
1832:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1833:       a->solves_work_n = bb->n;
1834:     }
1835:     n    = bb->n;
1836:     VecGetArrayRead(bb->v,&b);
1837:     VecGetArray(xx->v,&x);
1838:     t    = a->solves_work;

1840:     ISGetIndices(isrow,&rp);

1842:     /* solve U^T*D*y = perm(b) by forward substitution */
1843:     for (k=0; k<mbs; k++) {
1844:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1845:     }
1846:     for (k=0; k<mbs; k++) {
1847:       v  = aa + ai[k];
1848:       vj = aj + ai[k];
1849:       nz = ai[k+1] - ai[k] - 1;
1850:       for (j=0; j<nz; j++) {
1851:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1852:         v++;vj++;
1853:       }
1854:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1855:     }

1857:     /* solve U*perm(x) = y by back substitution */
1858:     for (k=mbs-1; k>=0; k--) {
1859:       v  = aa + ai[k] - 1;
1860:       vj = aj + ai[k] - 1;
1861:       nz = ai[k+1] - ai[k] - 1;
1862:       for (j=0; j<nz; j++) {
1863:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1864:         v++;vj++;
1865:       }
1866:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1867:     }

1869:     ISRestoreIndices(isrow,&rp);
1870:     VecRestoreArrayRead(bb->v,&b);
1871:     VecRestoreArray(xx->v,&x);
1872:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1873:   }
1874:   return(0);
1875: }

1877: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1878: {
1879:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1883:   if (A->rmap->bs == 1) {
1884:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1885:   } else {
1886:     IS                isrow=a->row;
1887:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1888:     const MatScalar   *aa=a->a,*v;
1889:     PetscScalar       *x,*t;
1890:     const PetscScalar *b;
1891:     PetscInt          nz,k,n,i;

1893:     if (bb->n > a->solves_work_n) {
1894:       PetscFree(a->solves_work);
1895:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1896:       a->solves_work_n = bb->n;
1897:     }
1898:     n    = bb->n;
1899:     VecGetArrayRead(bb->v,&b);
1900:     VecGetArray(xx->v,&x);
1901:     t    = a->solves_work;

1903:     ISGetIndices(isrow,&rp);

1905:     /* solve U^T*D*y = perm(b) by forward substitution */
1906:     for (k=0; k<mbs; k++) {
1907:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1908:     }
1909:     for (k=0; k<mbs; k++) {
1910:       v  = aa + ai[k];
1911:       vj = aj + ai[k];
1912:       nz = ai[k+1] - ai[k];
1913:       while (nz--) {
1914:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1915:         v++;vj++;
1916:       }
1917:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1918:     }

1920:     /* solve U*perm(x) = y by back substitution */
1921:     for (k=mbs-1; k>=0; k--) {
1922:       v  = aa + ai[k];
1923:       vj = aj + ai[k];
1924:       nz = ai[k+1] - ai[k];
1925:       while (nz--) {
1926:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1927:         v++;vj++;
1928:       }
1929:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1930:     }

1932:     ISRestoreIndices(isrow,&rp);
1933:     VecRestoreArrayRead(bb->v,&b);
1934:     VecRestoreArray(xx->v,&x);
1935:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1936:   }
1937:   return(0);
1938: }

1940: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1941: {
1942:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1943:   PetscErrorCode    ierr;
1944:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1945:   const MatScalar   *aa=a->a,*v;
1946:   const PetscScalar *b;
1947:   PetscScalar       *x,xi;
1948:   PetscInt          nz,i,j;

1951:   VecGetArrayRead(bb,&b);
1952:   VecGetArray(xx,&x);

1954:   /* solve U^T*D*y = b by forward substitution */
1955:   PetscArraycpy(x,b,mbs);
1956:   for (i=0; i<mbs; i++) {
1957:     v  = aa + ai[i];
1958:     vj = aj + ai[i];
1959:     xi = x[i];
1960:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1961:     for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi;
1962:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1963:   }

1965:   /* solve U*x = y by backward substitution */
1966:   for (i=mbs-2; i>=0; i--) {
1967:     xi = x[i];
1968:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1969:     vj = aj + adiag[i] - 1;
1970:     nz = ai[i+1] - ai[i] - 1;
1971:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1972:     x[i] = xi;
1973:   }

1975:   VecRestoreArrayRead(bb,&b);
1976:   VecRestoreArray(xx,&x);
1977:   PetscLogFlops(4.0*a->nz - 3*mbs);
1978:   return(0);
1979: }

1981: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1982: {
1983:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1984:   PetscErrorCode    ierr;
1985:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
1986:   const MatScalar   *aa=a->a,*v;
1987:   PetscScalar       *x,xk;
1988:   const PetscScalar *b;
1989:   PetscInt          nz,k;

1992:   VecGetArrayRead(bb,&b);
1993:   VecGetArray(xx,&x);

1995:   /* solve U^T*D*y = b by forward substitution */
1996:   PetscArraycpy(x,b,mbs);
1997:   for (k=0; k<mbs; k++) {
1998:     v  = aa + ai[k] + 1;
1999:     vj = aj + ai[k] + 1;
2000:     xk = x[k];
2001:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2002:     while (nz--) x[*vj++] += (*v++) * xk;
2003:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2004:   }

2006:   /* solve U*x = y by back substitution */
2007:   for (k=mbs-2; k>=0; k--) {
2008:     v  = aa + ai[k] + 1;
2009:     vj = aj + ai[k] + 1;
2010:     xk = x[k];
2011:     nz = ai[k+1] - ai[k] - 1;
2012:     while (nz--) {
2013:       xk += (*v++) * x[*vj++];
2014:     }
2015:     x[k] = xk;
2016:   }

2018:   VecRestoreArrayRead(bb,&b);
2019:   VecRestoreArray(xx,&x);
2020:   PetscLogFlops(4.0*a->nz - 3*mbs);
2021:   return(0);
2022: }

2024: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2025: {
2026:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2027:   PetscErrorCode    ierr;
2028:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2029:   const MatScalar   *aa=a->a,*v;
2030:   PetscReal         diagk;
2031:   PetscScalar       *x;
2032:   const PetscScalar *b;
2033:   PetscInt          nz,k;

2036:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2037:   VecGetArrayRead(bb,&b);
2038:   VecGetArray(xx,&x);
2039:   PetscArraycpy(x,b,mbs);
2040:   for (k=0; k<mbs; k++) {
2041:     v  = aa + ai[k];
2042:     vj = aj + ai[k];
2043:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2044:     while (nz--) x[*vj++] += (*v++) * x[k];
2045:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2046:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2047:     x[k] *= PetscSqrtReal(diagk);
2048:   }
2049:   VecRestoreArrayRead(bb,&b);
2050:   VecRestoreArray(xx,&x);
2051:   PetscLogFlops(2.0*a->nz - mbs);
2052:   return(0);
2053: }

2055: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2056: {
2057:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2058:   PetscErrorCode    ierr;
2059:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2060:   const MatScalar   *aa=a->a,*v;
2061:   PetscReal         diagk;
2062:   PetscScalar       *x;
2063:   const PetscScalar *b;
2064:   PetscInt          nz,k;

2067:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2068:   VecGetArrayRead(bb,&b);
2069:   VecGetArray(xx,&x);
2070:   PetscArraycpy(x,b,mbs);
2071:   for (k=0; k<mbs; k++) {
2072:     v  = aa + ai[k] + 1;
2073:     vj = aj + ai[k] + 1;
2074:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2075:     while (nz--) x[*vj++] += (*v++) * x[k];
2076:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2077:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2078:     x[k] *= PetscSqrtReal(diagk);
2079:   }
2080:   VecRestoreArrayRead(bb,&b);
2081:   VecRestoreArray(xx,&x);
2082:   PetscLogFlops(2.0*a->nz - mbs);
2083:   return(0);
2084: }

2086: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2087: {
2088:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2089:   PetscErrorCode    ierr;
2090:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj;
2091:   const MatScalar   *aa=a->a,*v;
2092:   PetscReal         diagk;
2093:   PetscScalar       *x;
2094:   const PetscScalar *b;
2095:   PetscInt          nz,k;

2098:   /* solve D^(1/2)*U*x = b by back substitution */
2099:   VecGetArrayRead(bb,&b);
2100:   VecGetArray(xx,&x);

2102:   for (k=mbs-1; k>=0; k--) {
2103:     v     = aa + ai[k];
2104:     vj    = aj + ai[k];
2105:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2106:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2107:     x[k] = PetscSqrtReal(diagk)*b[k];
2108:     nz   = ai[k+1] - ai[k] - 1;
2109:     while (nz--) x[k] += (*v++) * x[*vj++];
2110:   }
2111:   VecRestoreArrayRead(bb,&b);
2112:   VecRestoreArray(xx,&x);
2113:   PetscLogFlops(2.0*a->nz - mbs);
2114:   return(0);
2115: }

2117: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2118: {
2119:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2120:   PetscErrorCode    ierr;
2121:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2122:   const MatScalar   *aa=a->a,*v;
2123:   PetscReal         diagk;
2124:   PetscScalar       *x;
2125:   const PetscScalar *b;
2126:   PetscInt          nz,k;

2129:   /* solve D^(1/2)*U*x = b by back substitution */
2130:   VecGetArrayRead(bb,&b);
2131:   VecGetArray(xx,&x);

2133:   for (k=mbs-1; k>=0; k--) {
2134:     v     = aa + ai[k] + 1;
2135:     vj    = aj + ai[k] + 1;
2136:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2137:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2138:     x[k] = PetscSqrtReal(diagk)*b[k];
2139:     nz   = ai[k+1] - ai[k] - 1;
2140:     while (nz--) x[k] += (*v++) * x[*vj++];
2141:   }
2142:   VecRestoreArrayRead(bb,&b);
2143:   VecRestoreArray(xx,&x);
2144:   PetscLogFlops(2.0*a->nz - mbs);
2145:   return(0);
2146: }

2148: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2149: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2150: {
2151:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2153:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2154:   PetscInt       *jutmp,bs = A->rmap->bs,i;
2155:   PetscInt       m,reallocs = 0,*levtmp;
2156:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2157:   PetscInt       incrlev,*lev,shift,prow,nz;
2158:   PetscReal      f = info->fill,levels = info->levels;
2159:   PetscBool      perm_identity;

2162:   /* check whether perm is the identity mapping */
2163:   ISIdentity(perm,&perm_identity);

2165:   if (perm_identity) {
2166:     a->permute = PETSC_FALSE;
2167:     ai         = a->i; aj = a->j;
2168:   } else { /*  non-trivial permutation */
2169:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2170:     a->permute = PETSC_TRUE;
2171:     MatReorderingSeqSBAIJ(A, perm);
2172:     ai         = a->inew; aj = a->jnew;
2173:   }

2175:   /* initialization */
2176:   ISGetIndices(perm,&rip);
2177:   umax  = (PetscInt)(f*ai[mbs] + 1);
2178:   PetscMalloc1(umax,&lev);
2179:   umax += mbs + 1;
2180:   shift = mbs + 1;
2181:   PetscMalloc1(mbs+1,&iu);
2182:   PetscMalloc1(umax,&ju);
2183:   iu[0] = mbs + 1;
2184:   juidx = mbs + 1;
2185:   /* prowl: linked list for pivot row */
2186:   PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2187:   /* q: linked list for col index */

2189:   for (i=0; i<mbs; i++) {
2190:     prowl[i] = mbs;
2191:     q[i]     = 0;
2192:   }

2194:   /* for each row k */
2195:   for (k=0; k<mbs; k++) {
2196:     nzk  = 0;
2197:     q[k] = mbs;
2198:     /* copy current row into linked list */
2199:     nz = ai[rip[k]+1] - ai[rip[k]];
2200:     j  = ai[rip[k]];
2201:     while (nz--) {
2202:       vj = rip[aj[j++]];
2203:       if (vj > k) {
2204:         qm = k;
2205:         do {
2206:           m = qm; qm = q[m];
2207:         } while (qm < vj);
2208:         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2209:         nzk++;
2210:         q[m]       = vj;
2211:         q[vj]      = qm;
2212:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2213:       }
2214:     }

2216:     /* modify nonzero structure of k-th row by computing fill-in
2217:        for each row prow to be merged in */
2218:     prow = k;
2219:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2221:     while (prow < k) {
2222:       /* merge row prow into k-th row */
2223:       jmin = iu[prow] + 1;
2224:       jmax = iu[prow+1];
2225:       qm   = k;
2226:       for (j=jmin; j<jmax; j++) {
2227:         incrlev = lev[j-shift] + 1;
2228:         if (incrlev > levels) continue;
2229:         vj = ju[j];
2230:         do {
2231:           m = qm; qm = q[m];
2232:         } while (qm < vj);
2233:         if (qm != vj) {      /* a fill */
2234:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2235:           levtmp[vj] = incrlev;
2236:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2237:       }
2238:       prow = prowl[prow]; /* next pivot row */
2239:     }

2241:     /* add k to row list for first nonzero element in k-th row */
2242:     if (nzk > 1) {
2243:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2244:       prowl[k] = prowl[i]; prowl[i] = k;
2245:     }
2246:     iu[k+1] = iu[k] + nzk;

2248:     /* allocate more space to ju and lev if needed */
2249:     if (iu[k+1] > umax) {
2250:       /* estimate how much additional space we will need */
2251:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2252:       /* just double the memory each time */
2253:       maxadd = umax;
2254:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2255:       umax += maxadd;

2257:       /* allocate a longer ju */
2258:       PetscMalloc1(umax,&jutmp);
2259:       PetscArraycpy(jutmp,ju,iu[k]);
2260:       PetscFree(ju);
2261:       ju   = jutmp;

2263:       PetscMalloc1(umax,&jutmp);
2264:       PetscArraycpy(jutmp,lev,iu[k]-shift);
2265:       PetscFree(lev);
2266:       lev       = jutmp;
2267:       reallocs += 2; /* count how many times we realloc */
2268:     }

2270:     /* save nonzero structure of k-th row in ju */
2271:     i=k;
2272:     while (nzk--) {
2273:       i                = q[i];
2274:       ju[juidx]        = i;
2275:       lev[juidx-shift] = levtmp[i];
2276:       juidx++;
2277:     }
2278:   }

2280: #if defined(PETSC_USE_INFO)
2281:   if (ai[mbs] != 0) {
2282:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2283:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2284:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2285:     PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2286:     PetscInfo(A,"for best performance.\n");
2287:   } else {
2288:     PetscInfo(A,"Empty matrix.\n");
2289:   }
2290: #endif

2292:   ISRestoreIndices(perm,&rip);
2293:   PetscFree3(prowl,q,levtmp);
2294:   PetscFree(lev);

2296:   /* put together the new matrix */
2297:   MatSeqSBAIJSetPreallocation(B,bs,0,NULL);

2299:   /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2300:   b    = (Mat_SeqSBAIJ*)(B)->data;
2301:   PetscFree2(b->imax,b->ilen);

2303:   b->singlemalloc = PETSC_FALSE;
2304:   b->free_a       = PETSC_TRUE;
2305:   b->free_ij      = PETSC_TRUE;
2306:   /* the next line frees the default space generated by the Create() */
2307:   PetscFree3(b->a,b->j,b->i);
2308:   PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2309:   b->j    = ju;
2310:   b->i    = iu;
2311:   b->diag = 0;
2312:   b->ilen = 0;
2313:   b->imax = 0;

2315:   ISDestroy(&b->row);
2316:   ISDestroy(&b->icol);
2317:   b->row  = perm;
2318:   b->icol = perm;
2319:   PetscObjectReference((PetscObject)perm);
2320:   PetscObjectReference((PetscObject)perm);
2321:   PetscMalloc1(bs*mbs+bs,&b->solve_work);
2322:   /* In b structure:  Free imax, ilen, old a, old j.
2323:      Allocate idnew, solve_work, new a, new j */
2324:   PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2325:   b->maxnz = b->nz = iu[mbs];

2327:   (B)->info.factor_mallocs   = reallocs;
2328:   (B)->info.fill_ratio_given = f;
2329:   if (ai[mbs] != 0) {
2330:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2331:   } else {
2332:     (B)->info.fill_ratio_needed = 0.0;
2333:   }
2334:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2335:   return(0);
2336: }

2338: /*
2339:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2340: */
2341:  #include <petscbt.h>
2342:  #include <../src/mat/utils/freespace.h>
2343: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2344: {
2345:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2346:   PetscErrorCode     ierr;
2347:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2348:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2349:   const PetscInt     *rip;
2350:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2351:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2352:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2353:   PetscReal          fill          =info->fill,levels=info->levels;
2354:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2355:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2356:   PetscBT            lnkbt;

2359:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2360:   MatMissingDiagonal(A,&missing,&d);
2361:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2362:   if (bs > 1) {
2363:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2364:     return(0);
2365:   }

2367:   /* check whether perm is the identity mapping */
2368:   ISIdentity(perm,&perm_identity);
2369:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2370:   a->permute = PETSC_FALSE;

2372:   PetscMalloc1(am+1,&ui);
2373:   PetscMalloc1(am+1,&udiag);
2374:   ui[0] = 0;

2376:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2377:   if (!levels) {
2378:     /* reuse the column pointers and row offsets for memory savings */
2379:     for (i=0; i<am; i++) {
2380:       ncols    = ai[i+1] - ai[i];
2381:       ui[i+1]  = ui[i] + ncols;
2382:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2383:     }
2384:     PetscMalloc1(ui[am]+1,&uj);
2385:     cols = uj;
2386:     for (i=0; i<am; i++) {
2387:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2388:       ncols = ai[i+1] - ai[i] -1;
2389:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2390:       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2391:     }
2392:   } else { /* case: levels>0 */
2393:     ISGetIndices(perm,&rip);

2395:     /* initialization */
2396:     /* jl: linked list for storing indices of the pivot rows
2397:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2398:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2399:     for (i=0; i<am; i++) {
2400:       jl[i] = am; il[i] = 0;
2401:     }

2403:     /* create and initialize a linked list for storing column indices of the active row k */
2404:     nlnk = am + 1;
2405:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2407:     /* initial FreeSpace size is fill*(ai[am]+1) */
2408:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2410:     current_space = free_space;

2412:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2414:     current_space_lvl = free_space_lvl;

2416:     for (k=0; k<am; k++) {  /* for each active row k */
2417:       /* initialize lnk by the column indices of row k */
2418:       nzk   = 0;
2419:       ncols = ai[k+1] - ai[k];
2420:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2421:       cols = aj+ai[k];
2422:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2423:       nzk += nlnk;

2425:       /* update lnk by computing fill-in for each pivot row to be merged in */
2426:       prow = jl[k]; /* 1st pivot row */

2428:       while (prow < k) {
2429:         nextprow = jl[prow];

2431:         /* merge prow into k-th row */
2432:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2433:         jmax  = ui[prow+1];
2434:         ncols = jmax-jmin;
2435:         i     = jmin - ui[prow];

2437:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2438:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2439:         j    = *(uj - 1);
2440:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2441:         nzk += nlnk;

2443:         /* update il and jl for prow */
2444:         if (jmin < jmax) {
2445:           il[prow] = jmin;

2447:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2448:         }
2449:         prow = nextprow;
2450:       }

2452:       /* if free space is not available, make more free space */
2453:       if (current_space->local_remaining<nzk) {
2454:         i    = am - k + 1; /* num of unfactored rows */
2455:         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2456:         PetscFreeSpaceGet(i,&current_space);
2457:         PetscFreeSpaceGet(i,&current_space_lvl);
2458:         reallocs++;
2459:       }

2461:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2462:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2463:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2465:       /* add the k-th row into il and jl */
2466:       if (nzk > 1) {
2467:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2468:         jl[k] = jl[i]; jl[i] = k;
2469:         il[k] = ui[k] + 1;
2470:       }
2471:       uj_ptr[k]     = current_space->array;
2472:       uj_lvl_ptr[k] = current_space_lvl->array;

2474:       current_space->array               += nzk;
2475:       current_space->local_used          += nzk;
2476:       current_space->local_remaining     -= nzk;
2477:       current_space_lvl->array           += nzk;
2478:       current_space_lvl->local_used      += nzk;
2479:       current_space_lvl->local_remaining -= nzk;

2481:       ui[k+1] = ui[k] + nzk;
2482:     }

2484:     ISRestoreIndices(perm,&rip);
2485:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2487:     /* destroy list of free space and other temporary array(s) */
2488:     PetscMalloc1(ui[am]+1,&uj);
2489:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2490:     PetscIncompleteLLDestroy(lnk,lnkbt);
2491:     PetscFreeSpaceDestroy(free_space_lvl);

2493:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2495:   /* put together the new matrix in MATSEQSBAIJ format */
2496:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2498:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2499:   PetscFree2(b->imax,b->ilen);

2501:   b->singlemalloc = PETSC_FALSE;
2502:   b->free_a       = PETSC_TRUE;
2503:   b->free_ij      = free_ij;

2505:   PetscMalloc1(ui[am]+1,&b->a);

2507:   b->j         = uj;
2508:   b->i         = ui;
2509:   b->diag      = udiag;
2510:   b->free_diag = PETSC_TRUE;
2511:   b->ilen      = 0;
2512:   b->imax      = 0;
2513:   b->row       = perm;
2514:   b->col       = perm;

2516:   PetscObjectReference((PetscObject)perm);
2517:   PetscObjectReference((PetscObject)perm);

2519:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2521:   PetscMalloc1(am+1,&b->solve_work);
2522:   PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2524:   b->maxnz = b->nz = ui[am];

2526:   fact->info.factor_mallocs   = reallocs;
2527:   fact->info.fill_ratio_given = fill;
2528:   if (ai[am] != 0) {
2529:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2530:   } else {
2531:     fact->info.fill_ratio_needed = 0.0;
2532:   }
2533: #if defined(PETSC_USE_INFO)
2534:   if (ai[am] != 0) {
2535:     PetscReal af = fact->info.fill_ratio_needed;
2536:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2537:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2538:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2539:   } else {
2540:     PetscInfo(A,"Empty matrix.\n");
2541:   }
2542: #endif
2543:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2544:   return(0);
2545: }

2547: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2548: {
2549:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2550:   Mat_SeqSBAIJ       *b;
2551:   PetscErrorCode     ierr;
2552:   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2553:   PetscInt           bs=A->rmap->bs,am=a->mbs;
2554:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2555:   PetscInt           reallocs=0,i,*ui;
2556:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2557:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2558:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2559:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2560:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2561:   PetscBT            lnkbt;

2564:   /*
2565:    This code originally uses Modified Sparse Row (MSR) storage
2566:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2567:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2568:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2569:    thus the original code in MSR format is still used for these cases.
2570:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2571:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2572:   */
2573:   if (bs > 1) {
2574:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2575:     return(0);
2576:   }

2578:   /* check whether perm is the identity mapping */
2579:   ISIdentity(perm,&perm_identity);
2580:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2581:   a->permute = PETSC_FALSE;

2583:   /* special case that simply copies fill pattern */
2584:   if (!levels) {
2585:     /* reuse the column pointers and row offsets for memory savings */
2586:     ui           = a->i;
2587:     uj           = a->j;
2588:     free_ij      = PETSC_FALSE;
2589:     ratio_needed = 1.0;
2590:   } else { /* case: levels>0 */
2591:     ISGetIndices(perm,&rip);

2593:     /* initialization */
2594:     PetscMalloc1(am+1,&ui);
2595:     ui[0] = 0;

2597:     /* jl: linked list for storing indices of the pivot rows
2598:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2599:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2600:     for (i=0; i<am; i++) {
2601:       jl[i] = am; il[i] = 0;
2602:     }

2604:     /* create and initialize a linked list for storing column indices of the active row k */
2605:     nlnk = am + 1;
2606:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2608:     /* initial FreeSpace size is fill*(ai[am]+1) */
2609:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2611:     current_space = free_space;

2613:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2615:     current_space_lvl = free_space_lvl;

2617:     for (k=0; k<am; k++) {  /* for each active row k */
2618:       /* initialize lnk by the column indices of row rip[k] */
2619:       nzk   = 0;
2620:       ncols = ai[rip[k]+1] - ai[rip[k]];
2621:       cols  = aj+ai[rip[k]];
2622:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2623:       nzk  += nlnk;

2625:       /* update lnk by computing fill-in for each pivot row to be merged in */
2626:       prow = jl[k]; /* 1st pivot row */

2628:       while (prow < k) {
2629:         nextprow = jl[prow];

2631:         /* merge prow into k-th row */
2632:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2633:         jmax     = ui[prow+1];
2634:         ncols    = jmax-jmin;
2635:         i        = jmin - ui[prow];
2636:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2637:         j        = *(uj_lvl_ptr[prow] + i - 1);
2638:         cols_lvl = uj_lvl_ptr[prow]+i;
2639:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2640:         nzk     += nlnk;

2642:         /* update il and jl for prow */
2643:         if (jmin < jmax) {
2644:           il[prow] = jmin;
2645:           j        = *cols;
2646:           jl[prow] = jl[j];
2647:           jl[j]    = prow;
2648:         }
2649:         prow = nextprow;
2650:       }

2652:       /* if free space is not available, make more free space */
2653:       if (current_space->local_remaining<nzk) {
2654:         i    = am - k + 1; /* num of unfactored rows */
2655:         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2656:         PetscFreeSpaceGet(i,&current_space);
2657:         PetscFreeSpaceGet(i,&current_space_lvl);
2658:         reallocs++;
2659:       }

2661:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2662:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2664:       /* add the k-th row into il and jl */
2665:       if (nzk-1 > 0) {
2666:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2667:         jl[k] = jl[i]; jl[i] = k;
2668:         il[k] = ui[k] + 1;
2669:       }
2670:       uj_ptr[k]     = current_space->array;
2671:       uj_lvl_ptr[k] = current_space_lvl->array;

2673:       current_space->array               += nzk;
2674:       current_space->local_used          += nzk;
2675:       current_space->local_remaining     -= nzk;
2676:       current_space_lvl->array           += nzk;
2677:       current_space_lvl->local_used      += nzk;
2678:       current_space_lvl->local_remaining -= nzk;

2680:       ui[k+1] = ui[k] + nzk;
2681:     }

2683:     ISRestoreIndices(perm,&rip);
2684:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2686:     /* destroy list of free space and other temporary array(s) */
2687:     PetscMalloc1(ui[am]+1,&uj);
2688:     PetscFreeSpaceContiguous(&free_space,uj);
2689:     PetscIncompleteLLDestroy(lnk,lnkbt);
2690:     PetscFreeSpaceDestroy(free_space_lvl);
2691:     if (ai[am] != 0) {
2692:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2693:     } else {
2694:       ratio_needed = 0.0;
2695:     }
2696:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2698:   /* put together the new matrix in MATSEQSBAIJ format */
2699:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2701:   b = (Mat_SeqSBAIJ*)(fact)->data;

2703:   PetscFree2(b->imax,b->ilen);

2705:   b->singlemalloc = PETSC_FALSE;
2706:   b->free_a       = PETSC_TRUE;
2707:   b->free_ij      = free_ij;

2709:   PetscMalloc1(ui[am]+1,&b->a);

2711:   b->j             = uj;
2712:   b->i             = ui;
2713:   b->diag          = 0;
2714:   b->ilen          = 0;
2715:   b->imax          = 0;
2716:   b->row           = perm;
2717:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2719:   PetscObjectReference((PetscObject)perm);
2720:   b->icol = perm;
2721:   PetscObjectReference((PetscObject)perm);
2722:   PetscMalloc1(am+1,&b->solve_work);

2724:   b->maxnz = b->nz = ui[am];

2726:   fact->info.factor_mallocs    = reallocs;
2727:   fact->info.fill_ratio_given  = fill;
2728:   fact->info.fill_ratio_needed = ratio_needed;
2729: #if defined(PETSC_USE_INFO)
2730:   if (ai[am] != 0) {
2731:     PetscReal af = fact->info.fill_ratio_needed;
2732:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2733:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2734:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2735:   } else {
2736:     PetscInfo(A,"Empty matrix.\n");
2737:   }
2738: #endif
2739:   if (perm_identity) {
2740:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2741:   } else {
2742:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2743:   }
2744:   return(0);
2745: }