Actual source code: vscatcusp.c

petsc-3.3-p5 2012-12-01
#include <petsc-private/isimpl.h>              /*I "petscis.h" I*/
#include <petsc-private/vecimpl.h>             /*I "petscvec.h" I*/

/*@
   VecScatterInitializeForGPU - Initializes a generalized scatter from one vector
 to another for GPU-based computation.  Effectively, this function creates all the
 necessary indexing buffers and work vectors needed to move only those entries of
 a vector that must be communicated across ranks.  This setup is performed the
 first time the function is called.  Thereafter, the function launches a kernel,
 VecCUSPCopySomeToContiguousBufferGPU_Public, which moves the scattered data into a
 contiguous buffer on the GPU.  Currently, this is only used in the context of the
 parallel SpMV call in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu), where it
 is executed before the call to MatMult.  This enables the memory transfers to be
 overlapped with the MatMult SpMV kernel call.

   Input Parameters:
+  inctx - scatter context generated by VecScatterCreate()
.  x - the vector from which we scatter
-  mode - the scattering mode, usually SCATTER_FORWARD.  The available modes are:
    SCATTER_FORWARD or SCATTER_REVERSE

  Level: intermediate

.seealso: VecScatterCreate(), VecScatterEnd()
@*/
PetscErrorCode  VecScatterInitializeForGPU(VecScatter inctx,Vec x,ScatterMode mode)
{
  VecScatter_MPI_General *to,*from;
  PetscScalar            *xv,*svalues;
  MPI_Request            *rwaits,*swaits;
  PetscErrorCode         ierr;
  PetscInt               i,*indices,*sstartsSends,*sstartsRecvs,nrecvs,nsends,bs;

  PetscFunctionBegin;
  if (mode & SCATTER_REVERSE) {
    to     = (VecScatter_MPI_General*)inctx->fromdata;
    from   = (VecScatter_MPI_General*)inctx->todata;
    rwaits = from->rev_requests;
    swaits = to->rev_requests;
  } else {
    to     = (VecScatter_MPI_General*)inctx->todata;
    from   = (VecScatter_MPI_General*)inctx->fromdata;
    rwaits = from->requests;
    swaits = to->requests;
  }
  bs           = to->bs;
  svalues      = to->values;
  nrecvs       = from->n;
  nsends       = to->n;
  indices      = to->indices;
  sstartsSends = to->starts;
  sstartsRecvs = from->starts;
  if (x->valid_GPU_array != PETSC_CUSP_UNALLOCATED && (nsends>0 || nrecvs>0)) {
    if (!inctx->spptr) {
      PetscInt k,*tindicesSends,*sindicesSends,*tindicesRecvs,*sindicesRecvs;
      PetscInt ns = sstartsSends[nsends],nr = sstartsRecvs[nrecvs];
      /* Here we create indices for both the senders and receivers. */
      ierr = PetscMalloc(ns*sizeof(PetscInt),&tindicesSends);CHKERRQ(ierr);
      ierr = PetscMalloc(nr*sizeof(PetscInt),&tindicesRecvs);CHKERRQ(ierr);

      ierr = PetscMemcpy(tindicesSends,to->indices,ns*sizeof(PetscInt));CHKERRQ(ierr);
      ierr = PetscMemcpy(tindicesRecvs,from->indices,nr*sizeof(PetscInt));CHKERRQ(ierr);

      ierr = PetscSortRemoveDupsInt(&ns,tindicesSends);CHKERRQ(ierr);
      ierr = PetscSortRemoveDupsInt(&nr,tindicesRecvs);CHKERRQ(ierr);

      ierr = PetscMalloc(to->bs*ns*sizeof(PetscInt),&sindicesSends);CHKERRQ(ierr);
      ierr = PetscMalloc(from->bs*nr*sizeof(PetscInt),&sindicesRecvs);CHKERRQ(ierr);

      /* sender indices: each unique block-start index t expands to the bs
         consecutive scalar indices t, t+1, ..., t+bs-1 */
      for (i=0; i<ns; i++) {
        for (k=0; k<to->bs; k++) {
          sindicesSends[i*to->bs+k] = tindicesSends[i]+k;
        }
      }
      ierr = PetscFree(tindicesSends);CHKERRQ(ierr);

      /* receiver indices, expanded the same way */
      for (i=0; i<nr; i++) {
        for (k=0; k<from->bs; k++) {
          sindicesRecvs[i*from->bs+k] = tindicesRecvs[i]+k;
        }
      }
      ierr = PetscFree(tindicesRecvs);CHKERRQ(ierr);

      /* create GPU indices, work vectors, ... */
      ierr = PetscCUSPIndicesCreate(ns*to->bs,sindicesSends,nr*from->bs,sindicesRecvs,(PetscCUSPIndices*)&inctx->spptr);CHKERRQ(ierr);
      ierr = PetscFree(sindicesSends);CHKERRQ(ierr);
      ierr = PetscFree(sindicesRecvs);CHKERRQ(ierr);
    }
    /*
       The copy kernel that packs the scattered entries into a contiguous buffer
       on the GPU would be launched here; that buffer is then messaged after the
       MatMult is called.  The call is currently disabled.
    */
#if 0
    ierr = VecCUSPCopySomeToContiguousBufferGPU_Public(x,(PetscCUSPIndices)inctx->spptr);CHKERRQ(ierr);
#endif
  } else {
    ierr = VecGetArrayRead(x,(const PetscScalar**)&xv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
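
/*
   Illustrative sketch (not part of this source file): the call sequence that
   MatMult_MPIAIJCUSP (mpi/mpicusp/mpiaijcusp.cu) is assumed to follow, based on
   the standard MatMult_MPIAIJ structure.  Calling VecScatterInitializeForGPU()
   before VecScatterBegin() allows the packing and transfer of the off-process
   entries to overlap the local SpMV kernel.  The names a, xx, yy and the field
   accesses (a->Mvctx, a->lvec, a->A, a->B), with a of type Mat_MPIAIJ*, are
   assumptions for this sketch and are not copied from the actual implementation.
*/
#if 0
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);            /* build GPU index buffers (first call only) */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); /* start communicating the needed entries */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);                                     /* local (diagonal-block) SpMV overlaps the messages */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
#endif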
/*@
   VecScatterFinalizeForGPU - Finalizes a generalized scatter from one vector to
 another for GPU-based computation.  Effectively, this function resets the temporary
 buffer flags.  Currently, this is only used in the context of the parallel SpMV call
 in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu).  Once the MatMultAdd is
 finished, the GPU temporary buffers used for messaging are no longer valid.

   Input Parameter:
.  inctx - scatter context generated by VecScatterCreate()

  Level: intermediate

@*/
PetscErrorCode  VecScatterFinalizeForGPU(VecScatter inctx)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (inctx->spptr) {
    ierr = VecCUSPResetIndexBuffersFlagsGPU_Public((PetscCUSPIndices)inctx->spptr);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
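
/*
   Continuing the sketch above: once the off-diagonal contribution has been added
   with MatMultAdd, the temporary GPU message buffers are no longer valid, so the
   assumed sequence in MatMult_MPIAIJCUSP ends with VecScatterFinalizeForGPU().
   As before, the names a, lvec, yy are assumptions for illustration only.
*/
#if 0
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);  /* off-diagonal block: yy += B * lvec */
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);         /* reset the temporary GPU buffer flags */
#endif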