/* Factorization code for BAIJ format. */ #include <../src/mat/impls/baij/seq/baij.h> #include #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) #include #endif /* Version for when blocks are 9 by 9 */ #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) PetscErrorCode MatLUFactorNumeric_SeqBAIJ_9_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) { Mat C =B; Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; PetscErrorCode ierr; PetscInt i,j,k,nz,nzL,row; const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; PetscInt flg; PetscReal shift = info->shiftamount; PetscBool allowzeropivot,zeropivotdetected; PetscFunctionBegin; allowzeropivot = PetscNot(A->erroriffailure); /* generate work space needed by the factorization */ ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); for (i=0; ia + bs2*bdiag[row]; /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ ierr = PetscKernel_A_gets_A_times_B_9(pc,pv,mwork);CHKERRQ(ierr); pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ pv = b->a + bs2*(bdiag[row+1]+1); nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ for (j=0; ja */ /* L part */ pv = b->a + bs2*bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bs2*bdiag[i]; pj = b->j + bdiag[i]; ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); ierr = PetscKernel_A_gets_inverse_A_9(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; /* U part */ pv = b->a + bs2*(bdiag[i+1]+1); pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1] - 1; for (j=0; jops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; C->assembled = PETSC_TRUE; ierr = PetscLogFlops(1.333333333333*9*9*9*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ PetscFunctionReturn(0); } PetscErrorCode MatSolve_SeqBAIJ_9_NaturalOrdering(Mat A,Vec bb,Vec xx) { Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; PetscErrorCode ierr; const PetscInt *ai=a->i,*aj=a->j,*adiag=a->diag,*vi; PetscInt i,k,n=a->mbs; PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; const MatScalar *aa=a->a,*v; PetscScalar *x,*s,*t,*ls; const PetscScalar *b; __m256d a0,a1,a2,a3,a4,a5,w0,w1,w2,w3,s0,s1,s2,v0,v1,v2,v3; PetscFunctionBegin; ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); ierr = VecGetArray(xx,&x);CHKERRQ(ierr); t = a->solve_work; /* forward solve the lower triangular */ ierr = PetscArraycpy(t,b,bs);CHKERRQ(ierr); /* copy 1st block of b to t */ for (i=1; isolve_work + A->cmap->n; for (i=n-1; i>=0; i--) { v = aa + bs2*(adiag[i+1]+1); vi = aj + adiag[i+1]+1; nz = adiag[i] - adiag[i+1]-1; ierr = PetscArraycpy(ls,t+i*bs,bs);CHKERRQ(ierr); s0 = _mm256_loadu_pd(ls+0); s1 = _mm256_loadu_pd(ls+4); s2 = _mm256_maskload_pd(ls+8, _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63)); for (k=0; kbs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); PetscFunctionReturn(0); } #endif