Actual source code: blockinvert.h

petsc-3.14.3 2021-01-09
Report Typos and Errors
  1: /*
  2:     Kernels used in sparse ILU (and LU) and in the resulting triangular
  3:  solves. These are for block algorithms where the block sizes are on
  4:  the order of 2-6+.

  6:     There are TWO versions of the macros below.
  7:     1) standard for MatScalar == PetscScalar use the standard BLAS for
  8:        block size larger than 7 and
  9:     2) handcoded Fortran single precision for the matrices, since BLAS
 10:        does not have some arguments in single and some in double.

 12: */
 15: #include <petscblaslapack.h>

 17: /*
 18:       These are C kernels,they are contained in
 19:    src/mat/impls/baij/seq
 20: */

 22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
 23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
 24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
 25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);

 27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) 0; \
 28:   { \
 29:     MatScalar d, di; \
 30: \
 31:     di       = mat[0]; \
 32:     mat[0]   = d = 1.0 / di; \
 33:     mat[4]  *= -d; \
 34:     mat[8]  *= -d; \
 35:     mat[12] *= -d; \
 36:     mat[1]  *= d; \
 37:     mat[2]  *= d; \
 38:     mat[3]  *= d; \
 39:     mat[5]  += mat[4] * mat[1] * di; \
 40:     mat[6]  += mat[4] * mat[2] * di; \
 41:     mat[7]  += mat[4] * mat[3] * di; \
 42:     mat[9]  += mat[8] * mat[1] * di; \
 43:     mat[10] += mat[8] * mat[2] * di; \
 44:     mat[11] += mat[8] * mat[3] * di; \
 45:     mat[13] += mat[12] * mat[1] * di; \
 46:     mat[14] += mat[12] * mat[2] * di; \
 47:     mat[15] += mat[12] * mat[3] * di; \
 48:     di       = mat[5]; \
 49:     mat[5]   = d = 1.0 / di; \
 50:     mat[1]  *= -d; \
 51:     mat[9]  *= -d; \
 52:     mat[13] *= -d; \
 53:     mat[4]  *= d; \
 54:     mat[6]  *= d; \
 55:     mat[7]  *= d; \
 56:     mat[0]  += mat[1] * mat[4] * di; \
 57:     mat[2]  += mat[1] * mat[6] * di; \
 58:     mat[3]  += mat[1] * mat[7] * di; \
 59:     mat[8]  += mat[9] * mat[4] * di; \
 60:     mat[10] += mat[9] * mat[6] * di; \
 61:     mat[11] += mat[9] * mat[7] * di; \
 62:     mat[12] += mat[13] * mat[4] * di; \
 63:     mat[14] += mat[13] * mat[6] * di; \
 64:     mat[15] += mat[13] * mat[7] * di; \
 65:     di       = mat[10]; \
 66:     mat[10]  = d = 1.0 / di; \
 67:     mat[2]  *= -d; \
 68:     mat[6]  *= -d; \
 69:     mat[14] *= -d; \
 70:     mat[8]  *= d; \
 71:     mat[9]  *= d; \
 72:     mat[11] *= d; \
 73:     mat[0]  += mat[2] * mat[8] * di; \
 74:     mat[1]  += mat[2] * mat[9] * di; \
 75:     mat[3]  += mat[2] * mat[11] * di; \
 76:     mat[4]  += mat[6] * mat[8] * di; \
 77:     mat[5]  += mat[6] * mat[9] * di; \
 78:     mat[7]  += mat[6] * mat[11] * di; \
 79:     mat[12] += mat[14] * mat[8] * di; \
 80:     mat[13] += mat[14] * mat[9] * di; \
 81:     mat[15] += mat[14] * mat[11] * di; \
 82:     di       = mat[15]; \
 83:     mat[15]  = d = 1.0 / di; \
 84:     mat[3]  *= -d; \
 85:     mat[7]  *= -d; \
 86:     mat[11] *= -d; \
 87:     mat[12] *= d; \
 88:     mat[13] *= d; \
 89:     mat[14] *= d; \
 90:     mat[0]  += mat[3] * mat[12] * di; \
 91:     mat[1]  += mat[3] * mat[13] * di; \
 92:     mat[2]  += mat[3] * mat[14] * di; \
 93:     mat[4]  += mat[7] * mat[12] * di; \
 94:     mat[5]  += mat[7] * mat[13] * di; \
 95:     mat[6]  += mat[7] * mat[14] * di; \
 96:     mat[8]  += mat[11] * mat[12] * di; \
 97:     mat[9]  += mat[11] * mat[13] * di; \
 98:     mat[10] += mat[11] * mat[14] * di; \
 99:   }

101: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
102: # if defined(PETSC_HAVE_SSE)
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
104: # endif
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
107: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
108: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
109: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);

111: /*
112:     A = inv(A)    A_gets_inverse_A

114:    A      - square bs by bs array stored in column major order
115:    pivots - integer work array of length bs
116:    W      -  bs by 1 work array
117: */
118: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))

120: /* -----------------------------------------------------------------------*/

122: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
123: /*
124:         Version that calls the BLAS directly
125: */
126: /*
127:       A = A * B   A_gets_A_times_B

129:    A, B - square bs by bs arrays stored in column major order
130:    W    - square bs by bs work array

132: */
133: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
134:   {                                              \
135:     PetscBLASInt   _bbs;                         \
136:     PetscScalar    _one = 1.0,_zero = 0.0;       \
137:     PetscErrorCode _ierr;                        \
138:     _PetscBLASIntCast(bs,&_bbs); \
139:     _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr); \
143:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
144:   }

146: /*

148:     A = A - B * C  A_gets_A_minus_B_times_C

150:    A, B, C - square bs by bs arrays stored in column major order
151: */
152: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
153:   {                                                      \
154:     PetscScalar    _mone = -1.0,_one = 1.0;              \
155:     PetscBLASInt   _bbs;                                 \
156:     PetscErrorCode _ierr;                                \
157:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);   \
161:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
162:   }

164: /*

166:     A = A + B^T * C  A_gets_A_plus_Btranspose_times_C

168:    A, B, C - square bs by bs arrays stored in column major order
169: */
170: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
171:   {                                                              \
172:     PetscScalar    _one = 1.0;                                   \
173:     PetscBLASInt   _bbs;                                         \
174:     PetscErrorCode _ierr;                                        \
175:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);           \
179:     PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
180:   }

182: /*
183:     v = v + A^T w  v_gets_v_plus_Atranspose_times_w

185:    v - array of length bs
186:    A - square bs by bs array
187:    w - array of length bs
188: */
189: #define  PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
190:   {                                                               \
191:     PetscScalar    _one  = 1.0;                                    \
192:     PetscBLASInt   _ione = 1, _bbs;                               \
193:     PetscErrorCode _ierr;                                         \
194:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);            \
198:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
199:   }

201: /*
202:     v = v - A w  v_gets_v_minus_A_times_w

204:    v - array of length bs
205:    A - square bs by bs array
206:    w - array of length bs
207: */
208: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
209:   {                                                       \
210:     PetscScalar    _mone = -1.0,_one = 1.0;                 \
211:     PetscBLASInt   _ione = 1,_bbs;                         \
212:     PetscErrorCode _ierr;                                         \
213:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);            \
217:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
218:   }

220: /*
221:     v = v - A w  v_gets_v_minus_transA_times_w

223:    v - array of length bs
224:    A - square bs by bs array
225:    w - array of length bs
226: */
227: #define  PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) \
228:   {                                                            \
229:     PetscScalar    _mone = -1.0,_one = 1.0;                      \
230:     PetscBLASInt   _ione = 1,_bbs;                              \
231:     PetscErrorCode _ierr;                                      \
232:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);         \
236:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
237:   }

239: /*
240:     v = v + A w  v_gets_v_plus_A_times_w

242:    v - array of length bs
243:    A - square bs by bs array
244:    w - array of length bs
245: */
246: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
247:   {                                                      \
248:     PetscScalar    _one  = 1.0;                             \
249:     PetscBLASInt   _ione = 1,_bbs;                         \
250:     PetscErrorCode _ierr;                                      \
251:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);         \
255:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
256:   }

258: /*
259:     v = v + A w  v_gets_v_plus_Ar_times_w

261:    v - array of length bs
262:    A - square bs by bs array
263:    w - array of length bs
264: */
265: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
266:   {                                                             \
267:     PetscScalar    _one  = 1.0;                                    \
268:     PetscBLASInt   _ione = 1,_bbs,_bncols;                        \
269:     PetscErrorCode _ierr;                                      \
270:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);          \
271:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);    \
275:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
276:   }

278: /*
279:     w <- w - A v  w_gets_w_minus_Ar_times_v

281:    v - array of length ncol
282:    A(bs,ncols)
283:    w - array of length bs
284: */
285: #define  PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) \
286:   {                                                              \
287:     PetscScalar    _one  = 1.0,_mone = -1.0;                        \
288:     PetscBLASInt   _ione = 1,_bbs,_bncols;                         \
289:     PetscErrorCode _ierr;                                      \
290:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);          \
291:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);    \
295:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
296:   }

298: /*
299:     w = A v   w_gets_A_times_v

301:    v - array of length bs
302:    A - square bs by bs array
303:    w - array of length bs
304: */
305: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
306:   {                                              \
307:     PetscScalar    _zero = 0.0,_one = 1.0;         \
308:     PetscBLASInt   _ione = 1,_bbs;                 \
309:     PetscErrorCode _ierr;                          \
310:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);\
314:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
315:   }

317: /*
318:     w = A' v   w_gets_transA_times_v

320:    v - array of length bs
321:    A - square bs by bs array
322:    w - array of length bs
323: */
324: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w)     \
325:   {                                                       \
326:     PetscScalar    _zero = 0.0,_one = 1.0;                  \
327:     PetscBLASInt   _ione = 1,_bbs;                        \
328:     PetscErrorCode _ierr;                          \
329:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
333:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
334:   }

336: /*
337:         z = A*x
338: */
339: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
340:   {                                                     \
341:     PetscScalar    _one  = 1.0,_zero = 0.0;                 \
342:     PetscBLASInt   _ione = 1,_bbs,_bncols;        \
343:     PetscErrorCode _ierr; \
344:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
345:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
349:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
350:   }

352: /*
353:         z = trans(A)*x
354: */
355: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
356:   {                                                                  \
357:     PetscScalar    _one  = 1.0;                                         \
358:     PetscBLASInt   _ione = 1,_bbs,_bncols;                             \
359:     PetscErrorCode _ierr; \
360:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
361:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
365:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
366:   }

368: #else
369: /*
370:        Version that calls Fortran routines; can handle different precision
371:    of matrix (array) and vectors
372: */
373: /*
374:      These are Fortran kernels: They replace certain BLAS routines but
375:    have some arguments that may be single precision,rather than double
376:    These routines are provided in src/fortran/kernels/sgemv.F
377:    They are pretty pitiful but get the job done. The intention is
378:    that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
379:    code is used.
380: */

382: #if defined(PETSC_HAVE_FORTRAN_CAPS)
383: #define msgemv_  MSGEMV
384: #define msgemvp_ MSGEMVP
385: #define msgemvm_ MSGEMVM
386: #define msgemvt_ MSGEMVT
387: #define msgemmi_ MSGEMMI
388: #define msgemm_  MSGEMM
389: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
390: #define msgemv_  msgemv
391: #define msgemvp_ msgemvp
392: #define msgemvm_ msgemvm
393: #define msgemvt_ msgemvt
394: #define msgemmi_ msgemmi
395: #define msgemm_  msgemm
396: #endif

398: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
399: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
400: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
401: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
402: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
403: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);

405: /*
406:       A = A * B   A_gets_A_times_B

408:    A, B - square bs by bs arrays stored in column major order
409:    W    - square bs by bs work array

411: */
412: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
413:   {                                              \
414:     PetscErrorCode _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr); \
415:     msgemmi_(&bs,A,B,W); \
416:   }

418: /*

420:     A = A - B * C  A_gets_A_minus_B_times_C

422:    A, B, C - square bs by bs arrays stored in column major order
423: */
424: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
425:   {                                                      \
426:     msgemm_(&bs,A,B,C);                                  \
427:   }

429: /*
430:     v = v - A w  v_gets_v_minus_A_times_w

432:    v - array of length bs
433:    A - square bs by bs array
434:    w - array of length bs
435: */
436: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
437:   {                                                       \
438:     msgemvm_(&bs,&bs,A,w,v);                              \
439:   }

441: /*
442:     v = v + A w  v_gets_v_plus_A_times_w

444:    v - array of length bs
445:    A - square bs by bs array
446:    w - array of length bs
447: */
448: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
449:   {                                                      \
450:     msgemvp_(&bs,&bs,A,w,v);                             \
451:   }

453: /*
454:     v = v + A w  v_gets_v_plus_Ar_times_w

456:    v - array of length bs
457:    A - square bs by bs array
458:    w - array of length bs
459: */
460: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
461:   {                                                            \
462:     msgemvp_(&bs,&ncol,A,v,w);                                 \
463:   }

465: /*
466:     w = A v   w_gets_A_times_v

468:    v - array of length bs
469:    A - square bs by bs array
470:    w - array of length bs
471: */
472: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
473:   {  \
474:     msgemv_(&bs,&bs,A,v,w); \
475:   }

477: /*
478:         z = A*x
479: */
480: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
481:   { \
482:     msgemv_(&bs,&ncols,A,x,z); \
483:   }

485: /*
486:         z = trans(A)*x
487: */
488: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
489:   { \
490:     msgemvt_(&bs,&ncols,A,x,z); \
491:   }

493: /* These do not work yet */
494: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
495: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)


498: #endif

500: #endif