Actual source code: sfpack.h

  1: #pragma once

  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  4: #if defined(PETSC_HAVE_CUDA)
  5: #include <petscdevice_cuda.h>
  6: typedef cudaStream_t cupmStream_t;
  7: typedef cudaEvent_t  cupmEvent_t;
  8: #endif

 10: #if defined(PETSC_HAVE_HIP)
 11: #include <petscdevice_hip.h>
 12: typedef hipStream_t cupmStream_t;
 13: typedef hipEvent_t  cupmEvent_t;
 14: #endif

 16: /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
 17:    We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
 18:  */
 19: typedef long long int          llint;
 20: typedef unsigned long long int ullint;

 22: /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
 23: typedef enum {
 24:   PETSCSF_LOCAL = 0,
 25:   PETSCSF_REMOTE
 26: } PetscSFScope;

 28: /* Optimizations in packing & unpacking for destination ranks.

 30:   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
 31:      p[i] = u[idx[i]], for i in [0,m)

 33:   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
 34:   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
 35:   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.

 37:   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
 38:   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.

 40:   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
 41:   indices in whole are contiguous, and therefore much more useful than this one when true.
 42:  */
 43: struct _n_PetscSFPackOpt {
 44:   PetscInt *array;        /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
 45:   PetscInt  n;            /* Number of destination ranks */
 46:   PetscInt *offset;       /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
 47:   PetscInt *start;        /* [n] First index */
 48:   PetscInt *dx, *dy, *dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
 49:   PetscInt *X, *Y;        /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
 50: };

 52: /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
 53:  */
 54: struct _n_PetscSFLink {
 55:   PetscErrorCode (*Memcpy)(PetscSFLink, PetscMemType, void *, PetscMemType, const void *, size_t); /* Async device memcopy might use stream in the link */
 56:   PetscErrorCode (*PrePack)(PetscSF, PetscSFLink, PetscSFDirection);
 57:   PetscErrorCode (*PostUnpack)(PetscSF, PetscSFLink, PetscSFDirection);
 58:   PetscErrorCode (*InitMPIRequests)(PetscSF, PetscSFLink, PetscSFDirection); // init (persistent) MPI requests
 59:   PetscErrorCode (*StartCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
 60:   PetscErrorCode (*FinishCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
 61:   PetscErrorCode (*SyncDevice)(PetscSFLink);
 62:   PetscErrorCode (*SyncStream)(PetscSFLink);
 63:   PetscErrorCode (*Destroy)(PetscSF, PetscSFLink);

 65:   PetscErrorCode (*BuildDependenceBegin)(PetscSF, PetscSFLink, PetscSFDirection);
 66:   PetscErrorCode (*BuildDependenceEnd)(PetscSF, PetscSFLink, PetscSFDirection);

 68:   PetscErrorCode (*h_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
 69:   PetscErrorCode (*h_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 70:   PetscErrorCode (*h_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 71:   PetscErrorCode (*h_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 72:   PetscErrorCode (*h_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 73:   PetscErrorCode (*h_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 74:   PetscErrorCode (*h_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 75:   PetscErrorCode (*h_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 76:   PetscErrorCode (*h_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 77:   PetscErrorCode (*h_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 78:   PetscErrorCode (*h_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 79:   PetscErrorCode (*h_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 80:   PetscErrorCode (*h_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 81:   PetscErrorCode (*h_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
 82:   PetscErrorCode (*h_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);

 84:   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 85:   PetscErrorCode (*h_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 86:   PetscErrorCode (*h_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 87:   PetscErrorCode (*h_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 88:   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 89:   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 90:   PetscErrorCode (*h_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 91:   PetscErrorCode (*h_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 92:   PetscErrorCode (*h_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 93:   PetscErrorCode (*h_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 94:   PetscErrorCode (*h_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 95:   PetscErrorCode (*h_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
 96:   PetscErrorCode (*h_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);

 98:   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);

100:   PetscBool deviceinited; /* Are device related fields initialized? */
101: #if defined(PETSC_HAVE_DEVICE)
102:   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
103:      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
104:   */
105:   PetscErrorCode (*d_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
106:   PetscErrorCode (*d_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
107:   PetscErrorCode (*d_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
108:   PetscErrorCode (*d_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
109:   PetscErrorCode (*d_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
110:   PetscErrorCode (*d_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
111:   PetscErrorCode (*d_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
112:   PetscErrorCode (*d_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
113:   PetscErrorCode (*d_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
114:   PetscErrorCode (*d_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
115:   PetscErrorCode (*d_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
116:   PetscErrorCode (*d_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
117:   PetscErrorCode (*d_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
118:   PetscErrorCode (*d_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
119:   PetscErrorCode (*d_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);

121:   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
122:   PetscErrorCode (*d_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
123:   PetscErrorCode (*d_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
124:   PetscErrorCode (*d_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
125:   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
126:   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
127:   PetscErrorCode (*d_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
128:   PetscErrorCode (*d_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
129:   PetscErrorCode (*d_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
130:   PetscErrorCode (*d_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
131:   PetscErrorCode (*d_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
132:   PetscErrorCode (*d_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
133:   PetscErrorCode (*d_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
134:   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);

136:   /* Packing routines using atomics when there are data race chances */
137:   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
138:   PetscErrorCode (*da_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
139:   PetscErrorCode (*da_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
140:   PetscErrorCode (*da_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
141:   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
142:   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
143:   PetscErrorCode (*da_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
144:   PetscErrorCode (*da_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
145:   PetscErrorCode (*da_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
146:   PetscErrorCode (*da_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
147:   PetscErrorCode (*da_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
148:   PetscErrorCode (*da_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
149:   PetscErrorCode (*da_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
150:   PetscErrorCode (*da_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);

152:   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
153:   PetscErrorCode (*da_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
154:   PetscErrorCode (*da_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
155:   PetscErrorCode (*da_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
156:   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
157:   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
158:   PetscErrorCode (*da_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
159:   PetscErrorCode (*da_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
160:   PetscErrorCode (*da_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
161:   PetscErrorCode (*da_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
162:   PetscErrorCode (*da_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
163:   PetscErrorCode (*da_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
164:   PetscErrorCode (*da_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
165:   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
166:   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
167:   PetscInt     maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */
168:   cupmStream_t stream;                   /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */
169:   #endif
170: #endif
171:   PetscMPIInt  tag;                  /* Each link has a tag so we can perform multiple SF ops at the same time */
172:   MPI_Datatype unit;                 /* The MPI datatype this PetscSFLink is built for */
173:   MPI_Datatype basicunit;            /* unit is made of MPI builtin dataype basicunit */
174:   PetscBool    isbuiltin;            /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
175:   size_t       unitbytes;            /* Number of bytes in a unit */
176:   PetscInt     bs;                   /* Number of basic units in a unit */
177:   const void  *rootdata, *leafdata;  /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
178:   PetscMemType rootmtype, leafmtype; /* root/leafdata's memory type */

180:   /* For local and remote communication */
181:   PetscMemType rootmtype_mpi, leafmtype_mpi;   /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
182:   PetscBool    rootdirect[2], leafdirect[2];   /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
183:   PetscInt     rootdirect_mpi, leafdirect_mpi; /* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
184:   const void  *rootdatadirect[2][2];           /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
185:   const void  *leafdatadirect[2][2];           /* ... We need them to look up links when root/leafdirect_mpi are true */
186:   char        *rootbuf[2][2];                  /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */
187:                                                /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */
188:   char        *rootbuf_alloc[2][2];            /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
189:   char        *leafbuf[2][2];                  /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
190:   char        *leafbuf_alloc[2][2];
191:   MPI_Request *rootreqs[2][2][2];       /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
192:   MPI_Request *leafreqs[2][2][2];       /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
193:   PetscBool    rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
194:   PetscBool    leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
195:   MPI_Request *reqs;                    /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
196:   PetscSFLink  next;

198:   PetscBool use_nvshmem; /* Does this link use nvshem (vs. MPI) for communication? */
199: #if defined(PETSC_HAVE_NVSHMEM)
200:   cupmEvent_t  dataReady;        /* Events to mark readiness of root/leafdata */
201:   cupmEvent_t  endRemoteComm;    /* Events to mark end of local/remote communication */
202:   cupmStream_t remoteCommStream; /* Streams for remote (i.e., inter-rank) communication */

204:   /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */
205:   uint64_t *rootSendSig, *rootRecvSig; /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */
206:   uint64_t *leafSendSig, *leafRecvSig; /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */
207: #endif
208: };

210: PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF, MPI_Datatype, const void *, const void *);

212: /* Create/setup/retrieve/destroy a link */
213: PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
214: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF, PetscSFLink, MPI_Datatype);
215: PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF, MPI_Datatype, const void *, const void *, PetscCopyMode, PetscSFLink *);
216: PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF, PetscSFLink *);
217: PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF, PetscSFLink);

219: /* Get pack/unpack function pointers from a link */
220: static inline PetscErrorCode PetscSFLinkGetPack(PetscSFLink link, PetscMemType mtype, PetscErrorCode (**Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
221: {
222:   PetscFunctionBegin;
223:   if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack;
224: #if defined(PETSC_HAVE_DEVICE)
225:   else *Pack = link->d_Pack;
226: #endif
227:   PetscFunctionReturn(PETSC_SUCCESS);
228: }

230: PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *));
231: PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *));
232: PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *));
233: PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *));

235: /* Do Pack/Unpack/Fetch/Scatter with the link */
236: PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData(PetscSF, PetscSFLink, PetscSFScope, const void *);
237: PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData(PetscSF, PetscSFLink, PetscSFScope, const void *);
238: PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
239: PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
240: PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF, PetscSFLink, void *, MPI_Op);

242: PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF, PetscSFLink, PetscSFDirection, void *, void *, MPI_Op);
243: PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF, PetscSFLink, void *, const void *, void *, MPI_Op);

245: PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
246: PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
247: PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);

249: #if defined(PETSC_HAVE_CUDA)
250: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF, PetscSFLink, MPI_Datatype);
251: #endif

253: #if defined(PETSC_HAVE_HIP)
254: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF, PetscSFLink, MPI_Datatype);
255: #endif

257: #if defined(PETSC_HAVE_KOKKOS)
258: PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF, PetscSFLink, MPI_Datatype);
259: #endif

261: #if defined(PETSC_HAVE_NVSHMEM)
262: PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
263: PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF, PetscMemType, const void *, PetscMemType, const void *, PetscBool *);
264: #endif

266: static inline PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
267: {
268:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* memtype of buffers passed to MPI */
269:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

271:   PetscFunctionBegin;
272:   if (link->InitMPIRequests) PetscCall((*link->InitMPIRequests)(sf, link, direction)); // init (persistent) MPI requests

274:   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
275:   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
276:   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
277:   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
278:   PetscFunctionReturn(PETSC_SUCCESS);
279: }

281: static inline PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
282: {
283:   PetscFunctionBegin;
284:   if (link->StartCommunication) PetscCall((*link->StartCommunication)(sf, link, direction));
285:   PetscFunctionReturn(PETSC_SUCCESS);
286: }

288: static inline PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
289: {
290:   PetscFunctionBegin;
291:   if (link->FinishCommunication) PetscCall((*link->FinishCommunication)(sf, link, direction));
292:   PetscFunctionReturn(PETSC_SUCCESS);
293: }

295: /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
296: #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_SYCL)
297: /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
298:    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
299: */
300: static inline PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
301: {
302:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

304:   PetscFunctionBegin;
305:   /* rootdata is on device but we use regular MPI for communication */
306:   if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) {
307:     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
308:     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
309:     size_t count = bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes;
310:     if (device2host) {
311:       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count));
312:       PetscCall(PetscLogGpuToCpu(count));
313:     } else {
314:       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count));
315:       PetscCall(PetscLogCpuToGpu(count));
316:     }
317:   }
318:   PetscFunctionReturn(PETSC_SUCCESS);
319: }

321: static inline PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
322: {
323:   PetscFunctionBegin;
324:   if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) {
325:     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
326:     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
327:     size_t count = sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes;
328:     if (device2host) {
329:       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count));
330:       PetscCall(PetscLogGpuToCpu(count));
331:     } else {
332:       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count));
333:       PetscCall(PetscLogCpuToGpu(count));
334:     }
335:   }
336:   PetscFunctionReturn(PETSC_SUCCESS);
337: }

339: /* Make sure root/leafbuf for the remote is ready for MPI */
340: static inline PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
341: {
342:   PetscSF_Basic *bas;
343:   PetscInt       buflen;
344:   PetscMemType   mtype;

346:   PetscFunctionBegin;
347:   if (direction == PETSCSF_ROOT2LEAF) {
348:     bas    = (PetscSF_Basic *)sf->data;
349:     mtype  = link->rootmtype;
350:     buflen = bas->rootbuflen[PETSCSF_REMOTE];
351:   } else {
352:     mtype  = link->leafmtype;
353:     buflen = sf->leafbuflen[PETSCSF_REMOTE];
354:   }

356:   if (PetscMemTypeDevice(mtype) && buflen) PetscCall((*link->SyncStream)(link));
357:   PetscFunctionReturn(PETSC_SUCCESS);
358: }
359: #else /* Host only */
360:   #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS
361:   #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS
362:   #define PetscSFLinkSyncStreamBeforeCallMPI(a, b, c)               PETSC_SUCCESS
363: #endif

365: /* Get root indices used for pack/unpack

367: Input arguments:
368:   +sf    - StarForest
369:   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
370:   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
371:   -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)

373:  Output arguments:
374:   +count   - Count of indices
375:   .start   - The first index (only useful when indices is NULL)
376:   .opt     - Packing optimizations
377:   -indices - Indices of roots for pack/unpack. NULL means indices are contiguous
378:  */
379: static inline PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
380: {
381:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
382:   PetscInt       offset;

384:   PetscFunctionBegin;
385:   *count   = bas->rootbuflen[scope];
386:   *start   = bas->rootstart[scope];
387:   *opt     = NULL;
388:   *indices = NULL;

390:   /* We have these rules:
391:     1) opt == NULL && indices == NULL ==> indices are contiguous.
392:     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
393:        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
394:   */
395:   if (!bas->rootcontig[scope]) {
396:     offset = (scope == PETSCSF_LOCAL) ? 0 : bas->ioffset[bas->ndiranks];
397:     if (PetscMemTypeHost(mtype)) {
398:       *opt     = bas->rootpackopt[scope];
399:       *indices = bas->irootloc + offset;
400:     } else {
401:       size_t size;
402:       if (bas->rootpackopt[scope]) {
403:         if (!bas->rootpackopt_d[scope]) {
404:           PetscCall(PetscMalloc1(1, &bas->rootpackopt_d[scope]));
405:           PetscCall(PetscArraycpy(bas->rootpackopt_d[scope], bas->rootpackopt[scope], 1)); /* Make pointers in bas->rootpackopt_d[] still work on host */
406:           size = (bas->rootpackopt[scope]->n * 7 + 2) * sizeof(PetscInt);                  /* See comments at struct _n_PetscSFPackOpt*/
407:           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->rootpackopt_d[scope]->array));
408:           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->rootpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, bas->rootpackopt[scope]->array, size));
409:         }
410:         *opt = bas->rootpackopt_d[scope];
411:       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
412:         if (!bas->irootloc_d[scope]) {
413:           size = bas->rootbuflen[scope] * sizeof(PetscInt);
414:           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->irootloc_d[scope]));
415:           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[scope], PETSC_MEMTYPE_HOST, bas->irootloc + offset, size));
416:         }
417:         *indices = bas->irootloc_d[scope];
418:       }
419:     }
420:   }
421:   PetscFunctionReturn(PETSC_SUCCESS);
422: }

424: /* Get leaf indices used for pack/unpack

426:   See also PetscSFLinkGetRootPackOptAndIndices()
427:  */
428: static inline PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
429: {
430:   PetscInt offset;

432:   PetscFunctionBegin;
433:   *count   = sf->leafbuflen[scope];
434:   *start   = sf->leafstart[scope];
435:   *opt     = NULL;
436:   *indices = NULL;
437:   if (!sf->leafcontig[scope]) {
438:     offset = (scope == PETSCSF_LOCAL) ? 0 : sf->roffset[sf->ndranks];
439:     if (PetscMemTypeHost(mtype)) {
440:       *opt     = sf->leafpackopt[scope];
441:       *indices = sf->rmine + offset;
442:     } else {
443:       size_t size;
444:       if (sf->leafpackopt[scope]) {
445:         if (!sf->leafpackopt_d[scope]) {
446:           PetscCall(PetscMalloc1(1, &sf->leafpackopt_d[scope]));
447:           PetscCall(PetscArraycpy(sf->leafpackopt_d[scope], sf->leafpackopt[scope], 1));
448:           size = (sf->leafpackopt[scope]->n * 7 + 2) * sizeof(PetscInt);                                       /* See comments at struct _n_PetscSFPackOpt*/
449:           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->leafpackopt_d[scope]->array)); /* Change ->array to a device pointer */
450:           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->leafpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, sf->leafpackopt[scope]->array, size));
451:         }
452:         *opt = sf->leafpackopt_d[scope];
453:       } else {
454:         if (!sf->rmine_d[scope]) {
455:           size = sf->leafbuflen[scope] * sizeof(PetscInt);
456:           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->rmine_d[scope]));
457:           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->rmine_d[scope], PETSC_MEMTYPE_HOST, sf->rmine + offset, size));
458:         }
459:         *indices = sf->rmine_d[scope];
460:       }
461:     }
462:   }
463:   PetscFunctionReturn(PETSC_SUCCESS);
464: }