1: #include <petsc-private/isimpl.h> /*I "petscis.h" I*/
2: #include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
/*@
   VecScatterInitializeForGPU - Initializes a generalized scatter from one vector to
   another for GPU-based computation. Effectively, this function creates all the
   necessary indexing buffers and work vectors needed to move only those data points
   in a vector which need to be communicated across ranks. This is done the first time
   this function is called. Thereafter, this function launches a kernel,
   VecCUSPCopySomeToContiguousBufferGPU_Public, which moves the scattered data into a
   contiguous buffer on the GPU. Currently, this is only used in the context of the
   parallel SpMV call in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu). This
   function is executed before the call to MatMult. This enables the memory transfers
   to be overlapped with the MatMult SpMV kernel call.

   Input Parameters:
+  inctx - scatter context generated by VecScatterCreate()
.  x     - the vector from which we scatter
-  mode  - the scattering mode, usually SCATTER_FORWARD. The available modes are:
           SCATTER_FORWARD or SCATTER_REVERSE

   Level: intermediate

.seealso: VecScatterCreate(), VecScatterEnd()
@*/
28: PetscErrorCodeVecScatterInitializeForGPU(VecScatter inctx,Vec x,ScatterMode mode) 29: {
30: VecScatter_MPI_General *to,*from;
31: PetscScalar *xv,*svalues;
32: MPI_Request *rwaits,*swaits;
33: PetscErrorCode ierr;
34: PetscInt i,*indices,*sstartsSends,*sstartsRecvs,nrecvs,nsends,bs;
37: if (mode & SCATTER_REVERSE) {
38: to = (VecScatter_MPI_General*)inctx->fromdata;
39: from = (VecScatter_MPI_General*)inctx->todata;
40: rwaits = from->rev_requests;
41: swaits = to->rev_requests;
42: } else {
43: to = (VecScatter_MPI_General*)inctx->todata;
44: from = (VecScatter_MPI_General*)inctx->fromdata;
45: rwaits = from->requests;
46: swaits = to->requests;
47: }
48: bs = to->bs;
49: svalues = to->values;
50: nrecvs = from->n;
51: nsends = to->n;
52: indices = to->indices;
53: sstartsSends = to->starts;
54: sstartsRecvs = from->starts;
55: if (x->valid_GPU_array != PETSC_CUSP_UNALLOCATED && (nsends>0 || nrecvs>0))
56: {
57: if (!inctx->spptr) {
58: PetscInt k,*tindicesSends,*sindicesSends,*tindicesRecvs,*sindicesRecvs;
59: PetscInt ns = sstartsSends[nsends],nr = sstartsRecvs[nrecvs];
60: /* Here we create indices for both the senders and receivers. */
61: PetscMalloc(ns*sizeof(PetscInt),&tindicesSends);
62: PetscMalloc(nr*sizeof(PetscInt),&tindicesRecvs);
64: PetscMemcpy(tindicesSends,to->indices,ns*sizeof(PetscInt));
65: PetscMemcpy(tindicesRecvs,from->indices,nr*sizeof(PetscInt));
67: PetscSortRemoveDupsInt(&ns,tindicesSends);
68: PetscSortRemoveDupsInt(&nr,tindicesRecvs);
70: PetscMalloc(to->bs*ns*sizeof(PetscInt),&sindicesSends);
71: PetscMalloc(from->bs*nr*sizeof(PetscInt),&sindicesRecvs);
73: /* sender indices */
74: for (i=0; i<ns; i++) {
75: for (k=0; k<to->bs; k++) {
76: sindicesSends[i*to->bs+k] = tindicesSends[i]+k;
77: }
78: }
79: PetscFree(tindicesSends);
81: /* receiver indices */
82: for (i=0; i<nr; i++) {
83: for (k=0; k<from->bs; k++) {
84: sindicesRecvs[i*from->bs+k] = tindicesRecvs[i]+k;
85: }
86: }
87: PetscFree(tindicesRecvs);
89: /* create GPU indices, work vectors, ... */
90: PetscCUSPIndicesCreate(ns*to->bs,sindicesSends,nr*from->bs,sindicesRecvs,(PetscCUSPIndices*)&inctx->spptr);
91: PetscFree(sindicesSends);
92: PetscFree(sindicesRecvs);
93: }
94: /*
95: This should be called here.
96: ... basically, we launch the copy kernel that takes the scattered data and puts it in a
97: a contiguous buffer. Then, this buffer is messaged after the MatMult is called.
98: */
99: #if 0 /* Paul, why did you leave this line commented after writing the note above explaining why it should be called? */
100: VecCUSPCopySomeToContiguousBufferGPU_Public(x,(PetscCUSPIndices)inctx->spptr);
101: #endif
102: } else {
103: VecGetArrayRead(x,(const PetscScalar**)&xv);
104: }
105: return(0);
106: }
/*@
   VecScatterFinalizeForGPU - Finalizes a generalized scatter from one vector to
   another for GPU-based computation. Effectively, this function resets the temporary
   buffer flags. Currently, this is only used in the context of the parallel SpMV call
   in MatMult_MPIAIJCUSP (in mpi/mpicusp/mpiaijcusp.cu). Once the MatMultAdd is
   finished, the GPU temporary buffers used for messaging are no longer valid.

   Input Parameter:
.  inctx - scatter context generated by VecScatterCreate()

   Level: intermediate

@*/
122: PetscErrorCodeVecScatterFinalizeForGPU(VecScatter inctx)123: {
126: if (inctx->spptr)
127: VecCUSPResetIndexBuffersFlagsGPU_Public((PetscCUSPIndices)inctx->spptr);
128: return(0);
129: }