Actual source code: blockinvert.h
petsc-3.14.4 2021-02-03
1: /*
2: Kernels used in sparse ILU (and LU) and in the resulting triangular
3: solves. These are for block algorithms where the block sizes are on
4: the order of 2-6+.
6: There are TWO versions of the macros below.
7: 1) standard for MatScalar == PetscScalar use the standard BLAS for
8: block size larger than 7 and
9: 2) handcoded Fortran single precision for the matrices, since BLAS
10: does not have some arguments in single and some in double.
12: */
15: #include <petscblaslapack.h>
17: /*
18: These are C kernels,they are contained in
19: src/mat/impls/baij/seq
20: */
22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);
27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) 0; \
28: { \
29: MatScalar d, di; \
30: \
31: di = mat[0]; \
32: mat[0] = d = 1.0 / di; \
33: mat[4] *= -d; \
34: mat[8] *= -d; \
35: mat[12] *= -d; \
36: mat[1] *= d; \
37: mat[2] *= d; \
38: mat[3] *= d; \
39: mat[5] += mat[4] * mat[1] * di; \
40: mat[6] += mat[4] * mat[2] * di; \
41: mat[7] += mat[4] * mat[3] * di; \
42: mat[9] += mat[8] * mat[1] * di; \
43: mat[10] += mat[8] * mat[2] * di; \
44: mat[11] += mat[8] * mat[3] * di; \
45: mat[13] += mat[12] * mat[1] * di; \
46: mat[14] += mat[12] * mat[2] * di; \
47: mat[15] += mat[12] * mat[3] * di; \
48: di = mat[5]; \
49: mat[5] = d = 1.0 / di; \
50: mat[1] *= -d; \
51: mat[9] *= -d; \
52: mat[13] *= -d; \
53: mat[4] *= d; \
54: mat[6] *= d; \
55: mat[7] *= d; \
56: mat[0] += mat[1] * mat[4] * di; \
57: mat[2] += mat[1] * mat[6] * di; \
58: mat[3] += mat[1] * mat[7] * di; \
59: mat[8] += mat[9] * mat[4] * di; \
60: mat[10] += mat[9] * mat[6] * di; \
61: mat[11] += mat[9] * mat[7] * di; \
62: mat[12] += mat[13] * mat[4] * di; \
63: mat[14] += mat[13] * mat[6] * di; \
64: mat[15] += mat[13] * mat[7] * di; \
65: di = mat[10]; \
66: mat[10] = d = 1.0 / di; \
67: mat[2] *= -d; \
68: mat[6] *= -d; \
69: mat[14] *= -d; \
70: mat[8] *= d; \
71: mat[9] *= d; \
72: mat[11] *= d; \
73: mat[0] += mat[2] * mat[8] * di; \
74: mat[1] += mat[2] * mat[9] * di; \
75: mat[3] += mat[2] * mat[11] * di; \
76: mat[4] += mat[6] * mat[8] * di; \
77: mat[5] += mat[6] * mat[9] * di; \
78: mat[7] += mat[6] * mat[11] * di; \
79: mat[12] += mat[14] * mat[8] * di; \
80: mat[13] += mat[14] * mat[9] * di; \
81: mat[15] += mat[14] * mat[11] * di; \
82: di = mat[15]; \
83: mat[15] = d = 1.0 / di; \
84: mat[3] *= -d; \
85: mat[7] *= -d; \
86: mat[11] *= -d; \
87: mat[12] *= d; \
88: mat[13] *= d; \
89: mat[14] *= d; \
90: mat[0] += mat[3] * mat[12] * di; \
91: mat[1] += mat[3] * mat[13] * di; \
92: mat[2] += mat[3] * mat[14] * di; \
93: mat[4] += mat[7] * mat[12] * di; \
94: mat[5] += mat[7] * mat[13] * di; \
95: mat[6] += mat[7] * mat[14] * di; \
96: mat[8] += mat[11] * mat[12] * di; \
97: mat[9] += mat[11] * mat[13] * di; \
98: mat[10] += mat[11] * mat[14] * di; \
99: }
101: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
102: # if defined(PETSC_HAVE_SSE)
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
104: # endif
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
107: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
108: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
109: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
111: /*
112: A = inv(A) A_gets_inverse_A
114: A - square bs by bs array stored in column major order
115: pivots - integer work array of length bs
116: W - bs by 1 work array
117: */
118: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))
120: /* -----------------------------------------------------------------------*/
122: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
123: /*
124: Version that calls the BLAS directly
125: */
126: /*
127: A = A * B A_gets_A_times_B
129: A, B - square bs by bs arrays stored in column major order
130: W - square bs by bs work array
132: */
133: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
134: { \
135: PetscBLASInt _bbs; \
136: PetscScalar _one = 1.0,_zero = 0.0; \
137: PetscErrorCode _ierr; \
138: _PetscBLASIntCast(bs,&_bbs); \
139: _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr); \
143: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
144: }
146: /*
148: A = A - B * C A_gets_A_minus_B_times_C
150: A, B, C - square bs by bs arrays stored in column major order
151: */
152: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
153: { \
154: PetscScalar _mone = -1.0,_one = 1.0; \
155: PetscBLASInt _bbs; \
156: PetscErrorCode _ierr; \
157: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
161: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
162: }
164: /*
166: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
168: A, B, C - square bs by bs arrays stored in column major order
169: */
170: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
171: { \
172: PetscScalar _one = 1.0; \
173: PetscBLASInt _bbs; \
174: PetscErrorCode _ierr; \
175: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
179: PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
180: }
182: /*
183: v = v + A^T w v_gets_v_plus_Atranspose_times_w
185: v - array of length bs
186: A - square bs by bs array
187: w - array of length bs
188: */
189: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
190: { \
191: PetscScalar _one = 1.0; \
192: PetscBLASInt _ione = 1, _bbs; \
193: PetscErrorCode _ierr; \
194: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
198: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
199: }
201: /*
202: v = v - A w v_gets_v_minus_A_times_w
204: v - array of length bs
205: A - square bs by bs array
206: w - array of length bs
207: */
208: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
209: { \
210: PetscScalar _mone = -1.0,_one = 1.0; \
211: PetscBLASInt _ione = 1,_bbs; \
212: PetscErrorCode _ierr; \
213: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
217: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
218: }
220: /*
221: v = v - A w v_gets_v_minus_transA_times_w
223: v - array of length bs
224: A - square bs by bs array
225: w - array of length bs
226: */
227: #define PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) \
228: { \
229: PetscScalar _mone = -1.0,_one = 1.0; \
230: PetscBLASInt _ione = 1,_bbs; \
231: PetscErrorCode _ierr; \
232: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
236: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
237: }
239: /*
240: v = v + A w v_gets_v_plus_A_times_w
242: v - array of length bs
243: A - square bs by bs array
244: w - array of length bs
245: */
246: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
247: { \
248: PetscScalar _one = 1.0; \
249: PetscBLASInt _ione = 1,_bbs; \
250: PetscErrorCode _ierr; \
251: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
255: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
256: }
258: /*
259: v = v + A w v_gets_v_plus_Ar_times_w
261: v - array of length bs
262: A - square bs by bs array
263: w - array of length bs
264: */
265: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
266: { \
267: PetscScalar _one = 1.0; \
268: PetscBLASInt _ione = 1,_bbs,_bncols; \
269: PetscErrorCode _ierr; \
270: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
271: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
275: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
276: }
278: /*
279: w <- w - A v w_gets_w_minus_Ar_times_v
281: v - array of length ncol
282: A(bs,ncols)
283: w - array of length bs
284: */
285: #define PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) \
286: { \
287: PetscScalar _one = 1.0,_mone = -1.0; \
288: PetscBLASInt _ione = 1,_bbs,_bncols; \
289: PetscErrorCode _ierr; \
290: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
291: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
295: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
296: }
298: /*
299: w = A v w_gets_A_times_v
301: v - array of length bs
302: A - square bs by bs array
303: w - array of length bs
304: */
305: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
306: { \
307: PetscScalar _zero = 0.0,_one = 1.0; \
308: PetscBLASInt _ione = 1,_bbs; \
309: PetscErrorCode _ierr; \
310: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);\
314: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
315: }
317: /*
318: w = A' v w_gets_transA_times_v
320: v - array of length bs
321: A - square bs by bs array
322: w - array of length bs
323: */
324: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w) \
325: { \
326: PetscScalar _zero = 0.0,_one = 1.0; \
327: PetscBLASInt _ione = 1,_bbs; \
328: PetscErrorCode _ierr; \
329: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
333: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
334: }
336: /*
337: z = A*x
338: */
339: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
340: { \
341: PetscScalar _one = 1.0,_zero = 0.0; \
342: PetscBLASInt _ione = 1,_bbs,_bncols; \
343: PetscErrorCode _ierr; \
344: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
345: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
349: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
350: }
352: /*
353: z = trans(A)*x
354: */
355: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
356: { \
357: PetscScalar _one = 1.0; \
358: PetscBLASInt _ione = 1,_bbs,_bncols; \
359: PetscErrorCode _ierr; \
360: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
361: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
365: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
366: }
368: #else
369: /*
370: Version that calls Fortran routines; can handle different precision
371: of matrix (array) and vectors
372: */
373: /*
374: These are Fortran kernels: They replace certain BLAS routines but
375: have some arguments that may be single precision,rather than double
376: These routines are provided in src/fortran/kernels/sgemv.F
377: They are pretty pitiful but get the job done. The intention is
378: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
379: code is used.
380: */
382: #if defined(PETSC_HAVE_FORTRAN_CAPS)
383: #define msgemv_ MSGEMV
384: #define msgemvp_ MSGEMVP
385: #define msgemvm_ MSGEMVM
386: #define msgemvt_ MSGEMVT
387: #define msgemmi_ MSGEMMI
388: #define msgemm_ MSGEMM
389: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
390: #define msgemv_ msgemv
391: #define msgemvp_ msgemvp
392: #define msgemvm_ msgemvm
393: #define msgemvt_ msgemvt
394: #define msgemmi_ msgemmi
395: #define msgemm_ msgemm
396: #endif
398: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
399: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
400: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
401: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
402: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
403: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
405: /*
406: A = A * B A_gets_A_times_B
408: A, B - square bs by bs arrays stored in column major order
409: W - square bs by bs work array
411: */
412: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) \
413: { \
414: PetscErrorCode _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr); \
415: msgemmi_(&bs,A,B,W); \
416: }
418: /*
420: A = A - B * C A_gets_A_minus_B_times_C
422: A, B, C - square bs by bs arrays stored in column major order
423: */
424: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
425: { \
426: msgemm_(&bs,A,B,C); \
427: }
429: /*
430: v = v - A w v_gets_v_minus_A_times_w
432: v - array of length bs
433: A - square bs by bs array
434: w - array of length bs
435: */
436: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
437: { \
438: msgemvm_(&bs,&bs,A,w,v); \
439: }
441: /*
442: v = v + A w v_gets_v_plus_A_times_w
444: v - array of length bs
445: A - square bs by bs array
446: w - array of length bs
447: */
448: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
449: { \
450: msgemvp_(&bs,&bs,A,w,v); \
451: }
453: /*
454: v = v + A w v_gets_v_plus_Ar_times_w
456: v - array of length bs
457: A - square bs by bs array
458: w - array of length bs
459: */
460: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
461: { \
462: msgemvp_(&bs,&ncol,A,v,w); \
463: }
465: /*
466: w = A v w_gets_A_times_v
468: v - array of length bs
469: A - square bs by bs array
470: w - array of length bs
471: */
472: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) \
473: { \
474: msgemv_(&bs,&bs,A,v,w); \
475: }
477: /*
478: z = A*x
479: */
480: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
481: { \
482: msgemv_(&bs,&ncols,A,x,z); \
483: }
485: /*
486: z = trans(A)*x
487: */
488: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
489: { \
490: msgemvt_(&bs,&ncols,A,x,z); \
491: }
493: /* These do not work yet */
494: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
495: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
498: #endif
500: #endif