Actual source code: blockinvert.h

petsc-master 2019-04-18
Report Typos and Errors
  1: /*
  2:     Kernels used in sparse ILU (and LU) and in the resulting triangular
  3:  solves. These are for block algorithms where the block sizes are on
  4:  the order of 2-6+.

  6:     There are TWO versions of the macros below.
  7:     1) standard for MatScalar == PetscScalar use the standard BLAS for
  8:        block size larger than 7 and
  9:     2) handcoded Fortran single precision for the matrices, since BLAS
 10:        does not have some arguments in single and some in double.

 12: */
 15:  #include <petscblaslapack.h>

 17: /*
 18:       These are C kernels,they are contained in
 19:    src/mat/impls/baij/seq
 20: */

 22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
 23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
 24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
 25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);

 27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) 0; \
 28:   { \
 29:     MatScalar d, di; \
 30: \
 31:     di       = mat[0]; \
 32:     mat[0]   = d = 1.0 / di; \
 33:     mat[4]  *= -d; \
 34:     mat[8]  *= -d; \
 35:     mat[12] *= -d; \
 36:     mat[1]  *= d; \
 37:     mat[2]  *= d; \
 38:     mat[3]  *= d; \
 39:     mat[5]  += mat[4] * mat[1] * di; \
 40:     mat[6]  += mat[4] * mat[2] * di; \
 41:     mat[7]  += mat[4] * mat[3] * di; \
 42:     mat[9]  += mat[8] * mat[1] * di; \
 43:     mat[10] += mat[8] * mat[2] * di; \
 44:     mat[11] += mat[8] * mat[3] * di; \
 45:     mat[13] += mat[12] * mat[1] * di; \
 46:     mat[14] += mat[12] * mat[2] * di; \
 47:     mat[15] += mat[12] * mat[3] * di; \
 48:     di       = mat[5]; \
 49:     mat[5]   = d = 1.0 / di; \
 50:     mat[1]  *= -d; \
 51:     mat[9]  *= -d; \
 52:     mat[13] *= -d; \
 53:     mat[4]  *= d; \
 54:     mat[6]  *= d; \
 55:     mat[7]  *= d; \
 56:     mat[0]  += mat[1] * mat[4] * di; \
 57:     mat[2]  += mat[1] * mat[6] * di; \
 58:     mat[3]  += mat[1] * mat[7] * di; \
 59:     mat[8]  += mat[9] * mat[4] * di; \
 60:     mat[10] += mat[9] * mat[6] * di; \
 61:     mat[11] += mat[9] * mat[7] * di; \
 62:     mat[12] += mat[13] * mat[4] * di; \
 63:     mat[14] += mat[13] * mat[6] * di; \
 64:     mat[15] += mat[13] * mat[7] * di; \
 65:     di       = mat[10]; \
 66:     mat[10]  = d = 1.0 / di; \
 67:     mat[2]  *= -d; \
 68:     mat[6]  *= -d; \
 69:     mat[14] *= -d; \
 70:     mat[8]  *= d; \
 71:     mat[9]  *= d; \
 72:     mat[11] *= d; \
 73:     mat[0]  += mat[2] * mat[8] * di; \
 74:     mat[1]  += mat[2] * mat[9] * di; \
 75:     mat[3]  += mat[2] * mat[11] * di; \
 76:     mat[4]  += mat[6] * mat[8] * di; \
 77:     mat[5]  += mat[6] * mat[9] * di; \
 78:     mat[7]  += mat[6] * mat[11] * di; \
 79:     mat[12] += mat[14] * mat[8] * di; \
 80:     mat[13] += mat[14] * mat[9] * di; \
 81:     mat[15] += mat[14] * mat[11] * di; \
 82:     di       = mat[15]; \
 83:     mat[15]  = d = 1.0 / di; \
 84:     mat[3]  *= -d; \
 85:     mat[7]  *= -d; \
 86:     mat[11] *= -d; \
 87:     mat[12] *= d; \
 88:     mat[13] *= d; \
 89:     mat[14] *= d; \
 90:     mat[0]  += mat[3] * mat[12] * di; \
 91:     mat[1]  += mat[3] * mat[13] * di; \
 92:     mat[2]  += mat[3] * mat[14] * di; \
 93:     mat[4]  += mat[7] * mat[12] * di; \
 94:     mat[5]  += mat[7] * mat[13] * di; \
 95:     mat[6]  += mat[7] * mat[14] * di; \
 96:     mat[8]  += mat[11] * mat[12] * di; \
 97:     mat[9]  += mat[11] * mat[13] * di; \
 98:     mat[10] += mat[11] * mat[14] * di; \
 99:   }

101: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
102: # if defined(PETSC_HAVE_SSE)
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
104: # endif
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
107: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
108: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
109: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);

111: /*
112:     A = inv(A)    A_gets_inverse_A

114:    A      - square bs by bs array stored in column major order
115:    pivots - integer work array of length bs
116:    W      -  bs by 1 work array
117: */
118: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))

120: /* -----------------------------------------------------------------------*/

122: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
123: /*
124:         Version that calls the BLAS directly
125: */
126: /*
127:       A = A * B   A_gets_A_times_B

129:    A, B - square bs by bs arrays stored in column major order
130:    W    - square bs by bs work array

132: */
133: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
134:   {                                              \
135:     PetscBLASInt   _bbs;                         \
136:     PetscScalar    _one = 1.0,_zero = 0.0;       \
137:     PetscErrorCode _ierr;                        \
138:     _PetscBLASIntCast(bs,&_bbs); \
139:     _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
140:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
141:   }

143: /*

145:     A = A - B * C  A_gets_A_minus_B_times_C

147:    A, B, C - square bs by bs arrays stored in column major order
148: */
149: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
150:   {                                                      \
151:     PetscScalar    _mone = -1.0,_one = 1.0;              \
152:     PetscBLASInt   _bbs;                                 \
153:     PetscErrorCode _ierr;                                \
154:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);   \
155:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
156:   }

158: /*

160:     A = A + B^T * C  A_gets_A_plus_Btranspose_times_C

162:    A, B, C - square bs by bs arrays stored in column major order
163: */
164: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
165:   {                                                              \
166:     PetscScalar    _one = 1.0;                                   \
167:     PetscBLASInt   _bbs;                                         \
168:     PetscErrorCode _ierr;                                        \
169:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);           \
170:     PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
171:   }

173: /*
174:     v = v + A^T w  v_gets_v_plus_Atranspose_times_w

176:    v - array of length bs
177:    A - square bs by bs array
178:    w - array of length bs
179: */
180: #define  PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
181:   {                                                               \
182:     PetscScalar    _one  = 1.0;                                    \
183:     PetscBLASInt   _ione = 1, _bbs;                               \
184:     PetscErrorCode _ierr;                                         \
185:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);            \
186:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
187:   }

189: /*
190:     v = v - A w  v_gets_v_minus_A_times_w

192:    v - array of length bs
193:    A - square bs by bs array
194:    w - array of length bs
195: */
196: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
197:   {                                                       \
198:     PetscScalar    _mone = -1.0,_one = 1.0;                 \
199:     PetscBLASInt   _ione = 1,_bbs;                         \
200:     PetscErrorCode _ierr;                                         \
201:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);            \
202:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
203:   }

205: /*
206:     v = v - A w  v_gets_v_minus_transA_times_w

208:    v - array of length bs
209:    A - square bs by bs array
210:    w - array of length bs
211: */
212: #define  PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) \
213:   {                                                            \
214:     PetscScalar    _mone = -1.0,_one = 1.0;                      \
215:     PetscBLASInt   _ione = 1,_bbs;                              \
216:     PetscErrorCode _ierr;                                      \
217:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);         \
218:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
219:   }

221: /*
222:     v = v + A w  v_gets_v_plus_A_times_w

224:    v - array of length bs
225:    A - square bs by bs array
226:    w - array of length bs
227: */
228: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
229:   {                                                      \
230:     PetscScalar    _one  = 1.0;                             \
231:     PetscBLASInt   _ione = 1,_bbs;                         \
232:     PetscErrorCode _ierr;                                      \
233:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);         \
234:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
235:   }

237: /*
238:     v = v + A w  v_gets_v_plus_Ar_times_w

240:    v - array of length bs
241:    A - square bs by bs array
242:    w - array of length bs
243: */
244: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
245:   {                                                             \
246:     PetscScalar    _one  = 1.0;                                    \
247:     PetscBLASInt   _ione = 1,_bbs,_bncols;                        \
248:     PetscErrorCode _ierr;                                      \
249:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);          \
250:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);    \
251:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
252:   }

254: /*
255:     v = v - A w  v_gets_v_minus_Ar_times_w

257:    v - array of length ncol
258:    A(bs,ncols)
259:    w - array of length bs
260: */
261: #define  PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) \
262:   {                                                              \
263:     PetscScalar    _one  = 1.0,_mone = -1.0;                        \
264:     PetscBLASInt   _ione = 1,_bbs,_bncols;                         \
265:     PetscErrorCode _ierr;                                      \
266:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);          \
267:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);    \
268:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
269:   }

271: /*
272:     w = A v   w_gets_A_times_v

274:    v - array of length bs
275:    A - square bs by bs array
276:    w - array of length bs
277: */
278: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
279:   {                                              \
280:     PetscScalar    _zero = 0.0,_one = 1.0;         \
281:     PetscBLASInt   _ione = 1,_bbs;                 \
282:     PetscErrorCode _ierr;                          \
283:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                         \
284:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
285:   }

287: /*
288:     w = A' v   w_gets_transA_times_v

290:    v - array of length bs
291:    A - square bs by bs array
292:    w - array of length bs
293: */
294: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w)     \
295:   {                                                       \
296:     PetscScalar    _zero = 0.0,_one = 1.0;                  \
297:     PetscBLASInt   _ione = 1,_bbs;                        \
298:     PetscErrorCode _ierr;                          \
299:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
300:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
301:   }

303: /*
304:         z = A*x
305: */
306: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
307:   {                                                     \
308:     PetscScalar    _one  = 1.0,_zero = 0.0;                 \
309:     PetscBLASInt   _ione = 1,_bbs,_bncols;        \
310:     PetscErrorCode _ierr; \
311:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
312:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
313:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
314:   }

316: /*
317:         z = trans(A)*x
318: */
319: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
320:   {                                                                  \
321:     PetscScalar    _one  = 1.0;                                         \
322:     PetscBLASInt   _ione = 1,_bbs,_bncols;                             \
323:     PetscErrorCode _ierr; \
324:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
325:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
326:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
327:   }

329: #else
330: /*
331:        Version that calls Fortran routines; can handle different precision
332:    of matrix (array) and vectors
333: */
334: /*
335:      These are Fortran kernels: They replace certain BLAS routines but
336:    have some arguments that may be single precision,rather than double
337:    These routines are provided in src/fortran/kernels/sgemv.F
338:    They are pretty pitiful but get the job done. The intention is
339:    that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
340:    code is used.
341: */

343: #if defined(PETSC_HAVE_FORTRAN_CAPS)
344: #define msgemv_  MSGEMV
345: #define msgemvp_ MSGEMVP
346: #define msgemvm_ MSGEMVM
347: #define msgemvt_ MSGEMVT
348: #define msgemmi_ MSGEMMI
349: #define msgemm_  MSGEMM
350: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
351: #define msgemv_  msgemv
352: #define msgemvp_ msgemvp
353: #define msgemvm_ msgemvm
354: #define msgemvt_ msgemvt
355: #define msgemmi_ msgemmi
356: #define msgemm_  msgemm
357: #endif

359: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
360: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
361: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
362: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
363: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
364: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);

366: /*
367:       A = A * B   A_gets_A_times_B

369:    A, B - square bs by bs arrays stored in column major order
370:    W    - square bs by bs work array

372: */
373: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
374:   {                                              \
375:     PetscErrorCode _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
376:     msgemmi_(&bs,A,B,W); \
377:   }

379: /*

381:     A = A - B * C  A_gets_A_minus_B_times_C

383:    A, B, C - square bs by bs arrays stored in column major order
384: */
385: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
386:   {                                                      \
387:     msgemm_(&bs,A,B,C);                                  \
388:   }

390: /*
391:     v = v - A w  v_gets_v_minus_A_times_w

393:    v - array of length bs
394:    A - square bs by bs array
395:    w - array of length bs
396: */
397: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
398:   {                                                       \
399:     msgemvm_(&bs,&bs,A,w,v);                              \
400:   }

402: /*
403:     v = v + A w  v_gets_v_plus_A_times_w

405:    v - array of length bs
406:    A - square bs by bs array
407:    w - array of length bs
408: */
409: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
410:   {                                                      \
411:     msgemvp_(&bs,&bs,A,w,v);                             \
412:   }

414: /*
415:     v = v + A w  v_gets_v_plus_Ar_times_w

417:    v - array of length bs
418:    A - square bs by bs array
419:    w - array of length bs
420: */
421: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
422:   {                                                            \
423:     msgemvp_(&bs,&ncol,A,v,w);                                 \
424:   }

426: /*
427:     w = A v   w_gets_A_times_v

429:    v - array of length bs
430:    A - square bs by bs array
431:    w - array of length bs
432: */
433: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
434:   {  \
435:     msgemv_(&bs,&bs,A,v,w); \
436:   }

438: /*
439:         z = A*x
440: */
441: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
442:   { \
443:     msgemv_(&bs,&ncols,A,x,z); \
444:   }

446: /*
447:         z = trans(A)*x
448: */
449: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
450:   { \
451:     msgemvt_(&bs,&ncols,A,x,z); \
452:   }

454: /* These do not work yet */
455: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
456: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)


459: #endif

461: #endif