Actual source code: vpscat.c
1: #define PETSCVEC_DLL
3: /*
4: Defines parallel vector scatters.
5: */
7: #include "private/isimpl.h"
8: #include "private/vecimpl.h"
9: #include "../src/vec/vec/impls/dvecimpl.h"
10: #include "../src/vec/vec/impls/mpi/pvecimpl.h"
11: #include "petscsys.h"
15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
16: {
17: VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
18: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
19: PetscErrorCode ierr;
20: PetscInt i;
21: PetscMPIInt rank;
22: PetscViewerFormat format;
23: PetscTruth iascii;
26: PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
27: if (iascii) {
28: MPI_Comm_rank(((PetscObject)ctx)->comm,&rank);
29: PetscViewerGetFormat(viewer,&format);
30: if (format == PETSC_VIEWER_ASCII_INFO) {
31: PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
33: MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
34: MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
35: itmp = to->starts[to->n];
36: MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
37: itmp = from->starts[from->n];
38: MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
39: MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,((PetscObject)ctx)->comm);
41: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
42: PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);
43: PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);
44: PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
45: PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
46: PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
48: } else {
49: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
50: if (to->n) {
51: for (i=0; i<to->n; i++){
52: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
53: }
54: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
55: for (i=0; i<to->starts[to->n]; i++){
56: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
57: }
58: }
60: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
61: if (from->n) {
62: for (i=0; i<from->n; i++){
63: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
64: }
66: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
67: for (i=0; i<from->starts[from->n]; i++){
68: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
69: }
70: }
71: if (to->local.n) {
72: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
73: for (i=0; i<to->local.n; i++){
74: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
75: }
76: }
78: PetscViewerFlush(viewer);
79: }
80: } else {
81: SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
82: }
83: return(0);
84: }
86: /* -----------------------------------------------------------------------------------*/
87: /*
88: The next routine determines what part of the local part of the scatter is an
89: exact copy of values into their current location. We check this here and
90: then know that we need not perform that portion of the scatter when the vector is
91: scattering to itself with INSERT_VALUES.
93: This is currently not used but would speed up, for example, DALocalToLocalBegin/End().
95: */
98: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
99: {
100: PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
102: PetscInt *nto_slots,*nfrom_slots,j = 0;
103:
105: for (i=0; i<n; i++) {
106: if (to_slots[i] != from_slots[i]) n_nonmatching++;
107: }
109: if (!n_nonmatching) {
110: to->nonmatching_computed = PETSC_TRUE;
111: to->n_nonmatching = from->n_nonmatching = 0;
112: PetscInfo1(scatter,"Reduced %D to 0\n", n);
113: } else if (n_nonmatching == n) {
114: to->nonmatching_computed = PETSC_FALSE;
115: PetscInfo(scatter,"All values non-matching\n");
116: } else {
117: to->nonmatching_computed= PETSC_TRUE;
118: to->n_nonmatching = from->n_nonmatching = n_nonmatching;
119: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
120: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
121: to->slots_nonmatching = nto_slots;
122: from->slots_nonmatching = nfrom_slots;
123: for (i=0; i<n; i++) {
124: if (to_slots[i] != from_slots[i]) {
125: nto_slots[j] = to_slots[i];
126: nfrom_slots[j] = from_slots[i];
127: j++;
128: }
129: }
130: PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
131: }
132: return(0);
133: }
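/*
   Editorial sketch, not part of the original PETSc source: illustrates how the
   nonmatching slot lists computed above could be used by a local scatter with
   INSERT_VALUES, skipping the entries that already sit in the right location.
   The name apply_local_insert() and its argument list are hypothetical.
*/
PETSC_STATIC_INLINE void apply_local_insert(PetscInt n,const PetscInt *to_slots,const PetscInt *from_slots,
                                            PetscTruth nonmatching_computed,PetscInt n_nonmatching,
                                            const PetscInt *nto_slots,const PetscInt *nfrom_slots,
                                            const PetscScalar *x,PetscScalar *y)
{
  PetscInt i;
  if (nonmatching_computed) {
    /* only the entries that actually move need to be copied */
    for (i=0; i<n_nonmatching; i++) y[nto_slots[i]] = x[nfrom_slots[i]];
  } else {
    for (i=0; i<n; i++) y[to_slots[i]] = x[from_slots[i]];
  }
}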
135: /* --------------------------------------------------------------------------------------*/
137: /* -------------------------------------------------------------------------------------*/
140: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
141: {
142: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
143: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
144: PetscErrorCode ierr;
145: PetscInt i;
148: if (to->use_readyreceiver) {
149: /*
150: Since we have already posted sends we must cancel them before freeing
151: the requests
152: */
153: for (i=0; i<from->n; i++) {
154: MPI_Cancel(from->requests+i);
155: }
156: for (i=0; i<to->n; i++) {
157: MPI_Cancel(to->rev_requests+i);
158: }
159: }
161: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
162: if (to->use_alltoallw) {
163: PetscFree3(to->wcounts,to->wdispls,to->types);
164: PetscFree3(from->wcounts,from->wdispls,from->types);
165: }
166: #endif
168: #if defined(PETSC_HAVE_MPI_WINDOW)
169: if (to->use_window) {
170: MPI_Win_free(&from->window);
171: MPI_Win_free(&to->window);
172: }
173: #endif
175: if (to->use_alltoallv) {
176: PetscFree2(to->counts,to->displs);
177: PetscFree2(from->counts,from->displs);
178: }
180: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
181: /*
182: IBM's PE version of MPI has a bug where freeing these guys will screw up later
183: message passing.
184: */
185: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
186: if (!to->use_alltoallv) { /* currently the to->requests etc are ALWAYS allocated even if not used */
187: if (to->requests) {
188: for (i=0; i<to->n; i++) {
189: MPI_Request_free(to->requests + i);
190: }
191: }
192: if (to->rev_requests) {
193: for (i=0; i<to->n; i++) {
194: MPI_Request_free(to->rev_requests + i);
195: }
196: }
197: }
198: /*
199:      MPICH could not properly cancel requests; thus, with ready-receiver mode we
200:      cannot free the requests. It may be fixed now; if not, put the following
201:      code inside an if (!to->use_readyreceiver) { } block.
202: */
203: if (!to->use_alltoallv) { /* currently the from->requests etc are ALWAYS allocated even if not used */
204: if (from->requests) {
205: for (i=0; i<from->n; i++) {
206: MPI_Request_free(from->requests + i);
207: }
208: }
210: if (from->rev_requests) {
211: for (i=0; i<from->n; i++) {
212: MPI_Request_free(from->rev_requests + i);
213: }
214: }
215: }
216: #endif
218: PetscFree(to->local.vslots);
219: PetscFree(from->local.vslots);
220: PetscFree2(to->counts,to->displs);
221: PetscFree2(from->counts,from->displs);
222: PetscFree(to->local.slots_nonmatching);
223: PetscFree(from->local.slots_nonmatching);
224: PetscFree(to->rev_requests);
225: PetscFree(from->rev_requests);
226: PetscFree(to->requests);
227: PetscFree(from->requests);
228: PetscFree4(to->values,to->indices,to->starts,to->procs);
229: PetscFree2(to->sstatus,to->rstatus);
230: PetscFree4(from->values,from->indices,from->starts,from->procs);
231: PetscFree(from);
232: PetscFree(to);
233: PetscHeaderDestroy(ctx);
234: return(0);
235: }
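/*
   Editorial sketch, not part of the original PETSc source: the cancel-then-free
   ordering for persistent requests that have already been started (the ready-receiver
   case handled above).  The MPI standard recommends completing a cancelled request
   with MPI_Wait() before freeing it; the code above omits the wait because of the
   MPICH issue noted in the comment above.  ExampleCancelAndFree() is a hypothetical helper.
*/
static void ExampleCancelAndFree(PetscMPIInt n,MPI_Request *requests)
{
  PetscMPIInt i;
  MPI_Status  status;
  for (i=0; i<n; i++) {
    MPI_Cancel(requests+i);
    MPI_Wait(requests+i,&status);   /* let the cancellation complete */
    MPI_Request_free(requests+i);
  }
}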
239: /* --------------------------------------------------------------------------------------*/
240: /*
241: Special optimization to see if the local part of the scatter is actually
242:     a contiguous copy; if so, the scatter routines call PetscMemcpy() instead of indexing.
243:
244: */
247: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
248: {
249: PetscInt n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
250: PetscInt to_start,from_start;
254: to_start = to_slots[0];
255: from_start = from_slots[0];
257: for (i=1; i<n; i++) {
258: to_start += bs;
259: from_start += bs;
260: if (to_slots[i] != to_start) return(0);
261: if (from_slots[i] != from_start) return(0);
262: }
263: to->is_copy = PETSC_TRUE;
264: to->copy_start = to_slots[0];
265: to->copy_length = bs*sizeof(PetscScalar)*n;
266: from->is_copy = PETSC_TRUE;
267: from->copy_start = from_slots[0];
268: from->copy_length = bs*sizeof(PetscScalar)*n;
269: PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
270: return(0);
271: }
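/*
   Editorial sketch, not part of the original PETSc source: how the is_copy flag set
   above would be consumed when the local part of the scatter is executed.  Which
   buffer is the source and which the destination depends on the scatter direction;
   the forward direction is assumed here, and ExampleLocalCopy() is a hypothetical name.
*/
static void ExampleLocalCopy(const VecScatter_Seq_General *to,const VecScatter_Seq_General *from,
                             const PetscScalar *x,PetscScalar *y)
{
  if (to->is_copy) {
    /* one contiguous block; a single memcpy replaces the indexed loop */
    PetscMemcpy(y + to->copy_start,x + from->copy_start,to->copy_length);
  }
}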
273: /* --------------------------------------------------------------------------------------*/
277: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
278: {
279: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
280: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
281: PetscErrorCode ierr;
282: PetscInt ny,bs = in_from->bs;
285: out->begin = in->begin;
286: out->end = in->end;
287: out->copy = in->copy;
288: out->destroy = in->destroy;
289: out->view = in->view;
291: /* allocate entire send scatter context */
292: PetscNewLog(out,VecScatter_MPI_General,&out_to);
293: PetscNewLog(out,VecScatter_MPI_General,&out_from);
295: ny = in_to->starts[in_to->n];
296: out_to->n = in_to->n;
297: out_to->type = in_to->type;
298: out_to->sendfirst = in_to->sendfirst;
300: PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
301: PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
302: PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
303: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
304: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
305: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
306:
307: out->todata = (void*)out_to;
308: out_to->local.n = in_to->local.n;
309: out_to->local.nonmatching_computed = PETSC_FALSE;
310: out_to->local.n_nonmatching = 0;
311: out_to->local.slots_nonmatching = 0;
312: if (in_to->local.n) {
313: PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
314: PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
315: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
316: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
317: } else {
318: out_to->local.vslots = 0;
319: out_from->local.vslots = 0;
320: }
322: /* allocate entire receive context */
323: out_from->type = in_from->type;
324: ny = in_from->starts[in_from->n];
325: out_from->n = in_from->n;
326: out_from->sendfirst = in_from->sendfirst;
328: PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
329: PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
330: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
331: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
332: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
333: out->fromdata = (void*)out_from;
334: out_from->local.n = in_from->local.n;
335: out_from->local.nonmatching_computed = PETSC_FALSE;
336: out_from->local.n_nonmatching = 0;
337: out_from->local.slots_nonmatching = 0;
339: /*
340:       set up the persistent request arrays for use with MPI_Send_init() and MPI_Recv_init()
341: */
342: {
343: PetscMPIInt tag;
344: MPI_Comm comm;
345: PetscInt *sstarts = out_to->starts, *rstarts = out_from->starts;
346: PetscMPIInt *sprocs = out_to->procs, *rprocs = out_from->procs;
347: PetscInt i;
348: PetscTruth flg;
349: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
350: MPI_Request *rev_swaits,*rev_rwaits;
351: PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
353: PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
354: PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);
356: rev_rwaits = out_to->rev_requests;
357: rev_swaits = out_from->rev_requests;
359: out_from->bs = out_to->bs = bs;
360: tag = ((PetscObject)out)->tag;
361: comm = ((PetscObject)out)->comm;
363: /* Register the receives that you will use later (sends for scatter reverse) */
364: for (i=0; i<out_from->n; i++) {
365: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
366: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
367: }
369: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&flg);
370: if (flg) {
371: out_to->use_readyreceiver = PETSC_TRUE;
372: out_from->use_readyreceiver = PETSC_TRUE;
373: for (i=0; i<out_to->n; i++) {
374: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
375: }
376: if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
377: MPI_Barrier(comm);
378: PetscInfo(in,"Using VecScatter ready receiver mode\n");
379: } else {
380: out_to->use_readyreceiver = PETSC_FALSE;
381: out_from->use_readyreceiver = PETSC_FALSE;
382: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
383: if (flg) {
384: PetscInfo(in,"Using VecScatter Ssend mode\n");
385: }
386: for (i=0; i<out_to->n; i++) {
387: if (!flg) {
388: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
389: } else {
390: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
391: }
392: }
393: }
394: /* Register receives for scatter reverse */
395: for (i=0; i<out_to->n; i++) {
396: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
397: }
398: }
400: return(0);
401: }
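/*
   Editorial sketch, not part of the original PETSc source: the life cycle of the
   persistent requests built above with MPI_Send_init()/MPI_Recv_init().  The request
   arrays and counts are hypothetical stand-ins for the fields filled in above, and
   status must point to at least PetscMax(nsends,nrecvs) entries.
*/
static void ExamplePersistentCycle(PetscMPIInt nrecvs,MPI_Request *rwaits,
                                   PetscMPIInt nsends,MPI_Request *swaits,MPI_Status *status)
{
  /* every scatter reuses the same prebuilt requests: start them, then wait */
  if (nrecvs) MPI_Startall(nrecvs,rwaits);
  if (nsends) MPI_Startall(nsends,swaits);
  MPI_Waitall(nsends,swaits,status);
  MPI_Waitall(nrecvs,rwaits,status);
  /* the requests stay valid until the scatter is destroyed, where
     MPI_Request_free() releases them (see VecScatterDestroy_PtoP above) */
}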
405: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
406: {
407: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
408: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
409: PetscErrorCode ierr;
410: PetscInt ny,bs = in_from->bs;
411: PetscMPIInt size;
414: MPI_Comm_size(((PetscObject)in)->comm,&size);
415: out->begin = in->begin;
416: out->end = in->end;
417: out->copy = in->copy;
418: out->destroy = in->destroy;
419: out->view = in->view;
421: /* allocate entire send scatter context */
422: PetscNewLog(out,VecScatter_MPI_General,&out_to);
423: PetscNewLog(out,VecScatter_MPI_General,&out_from);
425: ny = in_to->starts[in_to->n];
426: out_to->n = in_to->n;
427: out_to->type = in_to->type;
428: out_to->sendfirst = in_to->sendfirst;
430: PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
431: PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
432: PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
433: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
434: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
435: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
436:
437: out->todata = (void*)out_to;
438: out_to->local.n = in_to->local.n;
439: out_to->local.nonmatching_computed = PETSC_FALSE;
440: out_to->local.n_nonmatching = 0;
441: out_to->local.slots_nonmatching = 0;
442: if (in_to->local.n) {
443: PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
444: PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
445: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
446: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
447: } else {
448: out_to->local.vslots = 0;
449: out_from->local.vslots = 0;
450: }
452: /* allocate entire receive context */
453: out_from->type = in_from->type;
454: ny = in_from->starts[in_from->n];
455: out_from->n = in_from->n;
456: out_from->sendfirst = in_from->sendfirst;
458: PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
459: PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
460: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
461: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
462: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
463: out->fromdata = (void*)out_from;
464: out_from->local.n = in_from->local.n;
465: out_from->local.nonmatching_computed = PETSC_FALSE;
466: out_from->local.n_nonmatching = 0;
467: out_from->local.slots_nonmatching = 0;
469: out_to->use_alltoallv = out_from->use_alltoallv = PETSC_TRUE;
471: PetscMalloc2(size,PetscMPIInt,&out_to->counts,size,PetscMPIInt,&out_to->displs);
472: PetscMemcpy(out_to->counts,in_to->counts,size*sizeof(PetscMPIInt));
473: PetscMemcpy(out_to->displs,in_to->displs,size*sizeof(PetscMPIInt));
475: PetscMalloc2(size,PetscMPIInt,&out_from->counts,size,PetscMPIInt,&out_from->displs);
476: PetscMemcpy(out_from->counts,in_from->counts,size*sizeof(PetscMPIInt));
477: PetscMemcpy(out_from->displs,in_from->displs,size*sizeof(PetscMPIInt));
478: return(0);
479: }
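/*
   Editorial sketch, not part of the original PETSc source: how the counts/displs
   arrays duplicated above are consumed by an MPI_Alltoallv()-based scatter.  The
   buffer arguments are hypothetical; in the real code they come from to->values
   and from->values.
*/
static void ExampleAlltoallv(PetscScalar *sendbuf,PetscMPIInt *scounts,PetscMPIInt *sdispls,
                             PetscScalar *recvbuf,PetscMPIInt *rcounts,PetscMPIInt *rdispls,MPI_Comm comm)
{
  /* rank p receives scounts[p] scalars taken from sendbuf+sdispls[p], and this rank
     receives rcounts[p] scalars from rank p into recvbuf+rdispls[p] */
  MPI_Alltoallv(sendbuf,scounts,sdispls,MPIU_SCALAR,recvbuf,rcounts,rdispls,MPIU_SCALAR,comm);
}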
480: /* --------------------------------------------------------------------------------------------------
481:     Packs and unpacks the message data into the send buffers or out of the receive buffers.
483: These could be generated automatically.
485: Fortran kernels etc. could be used.
486: */
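/*
   Editorial sketch, not part of the original PETSc source: the generic form that the
   fixed block size routines Pack_1 ... Pack_12 below specialize; Pack_bs() is a
   hypothetical name.
*/
PETSC_STATIC_INLINE void Pack_bs(PetscInt n,PetscInt bs,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,j,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                       /* first entry of the block in x */
    for (j=0; j<bs; j++) y[j] = x[idx+j];    /* copy one block into the send buffer */
    y += bs;
  }
}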
487: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
488: {
489: PetscInt i;
490: for (i=0; i<n; i++) {
491: y[i] = x[indicesx[i]];
492: }
493: }
494: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
495: {
496: PetscInt i;
497: switch (addv) {
498: case INSERT_VALUES:
499: for (i=0; i<n; i++) {
500: y[indicesy[i]] = x[i];
501: }
502: break;
503: case ADD_VALUES:
504: for (i=0; i<n; i++) {
505: y[indicesy[i]] += x[i];
506: }
507: break;
508: #if !defined(PETSC_USE_COMPLEX)
509: case MAX_VALUES:
510: for (i=0; i<n; i++) {
511: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
512: }
513: #else
514: case MAX_VALUES:
515: #endif
516: case NOT_SET_VALUES:
517: break;
518: }
519: }
521: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
522: {
523: PetscInt i;
524: switch (addv) {
525: case INSERT_VALUES:
526: for (i=0; i<n; i++) {
527: y[indicesy[i]] = x[indicesx[i]];
528: }
529: break;
530: case ADD_VALUES:
531: for (i=0; i<n; i++) {
532: y[indicesy[i]] += x[indicesx[i]];
533: }
534: break;
535: #if !defined(PETSC_USE_COMPLEX)
536: case MAX_VALUES:
537: for (i=0; i<n; i++) {
538: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
539: }
540: #else
541: case MAX_VALUES:
542: #endif
543: case NOT_SET_VALUES:
544: break;
545: }
546: }
548: /* ----------------------------------------------------------------------------------------------- */
549: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
550: {
551: PetscInt i,idx;
553: for (i=0; i<n; i++) {
554: idx = *indicesx++;
555: y[0] = x[idx];
556: y[1] = x[idx+1];
557: y += 2;
558: }
559: }
560: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
561: {
562: PetscInt i,idy;
564: switch (addv) {
565: case INSERT_VALUES:
566: for (i=0; i<n; i++) {
567: idy = *indicesy++;
568: y[idy] = x[0];
569: y[idy+1] = x[1];
570: x += 2;
571: }
572: break;
573: case ADD_VALUES:
574: for (i=0; i<n; i++) {
575: idy = *indicesy++;
576: y[idy] += x[0];
577: y[idy+1] += x[1];
578: x += 2;
579: }
580: break;
581: #if !defined(PETSC_USE_COMPLEX)
582: case MAX_VALUES:
583: for (i=0; i<n; i++) {
584: idy = *indicesy++;
585: y[idy] = PetscMax(y[idy],x[0]);
586: y[idy+1] = PetscMax(y[idy+1],x[1]);
587: x += 2;
588: }
589: #else
590: case MAX_VALUES:
591: #endif
592: case NOT_SET_VALUES:
593: break;
594: }
595: }
597: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
598: {
599: PetscInt i,idx,idy;
601: switch (addv) {
602: case INSERT_VALUES:
603: for (i=0; i<n; i++) {
604: idx = *indicesx++;
605: idy = *indicesy++;
606: y[idy] = x[idx];
607: y[idy+1] = x[idx+1];
608: }
609: break;
610: case ADD_VALUES:
611: for (i=0; i<n; i++) {
612: idx = *indicesx++;
613: idy = *indicesy++;
614: y[idy] += x[idx];
615: y[idy+1] += x[idx+1];
616: }
617: break;
618: #if !defined(PETSC_USE_COMPLEX)
619: case MAX_VALUES:
620: for (i=0; i<n; i++) {
621: idx = *indicesx++;
622: idy = *indicesy++;
623: y[idy] = PetscMax(y[idy],x[idx]);
624: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
625: }
626: #else
627: case MAX_VALUES:
628: #endif
629: case NOT_SET_VALUES:
630: break;
631: }
632: }
633: /* ----------------------------------------------------------------------------------------------- */
634: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
635: {
636: PetscInt i,idx;
638: for (i=0; i<n; i++) {
639: idx = *indicesx++;
640: y[0] = x[idx];
641: y[1] = x[idx+1];
642: y[2] = x[idx+2];
643: y += 3;
644: }
645: }
646: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
647: {
648: PetscInt i,idy;
650: switch (addv) {
651: case INSERT_VALUES:
652: for (i=0; i<n; i++) {
653: idy = *indicesy++;
654: y[idy] = x[0];
655: y[idy+1] = x[1];
656: y[idy+2] = x[2];
657: x += 3;
658: }
659: break;
660: case ADD_VALUES:
661: for (i=0; i<n; i++) {
662: idy = *indicesy++;
663: y[idy] += x[0];
664: y[idy+1] += x[1];
665: y[idy+2] += x[2];
666: x += 3;
667: }
668: break;
669: #if !defined(PETSC_USE_COMPLEX)
670: case MAX_VALUES:
671: for (i=0; i<n; i++) {
672: idy = *indicesy++;
673: y[idy] = PetscMax(y[idy],x[0]);
674: y[idy+1] = PetscMax(y[idy+1],x[1]);
675: y[idy+2] = PetscMax(y[idy+2],x[2]);
676: x += 3;
677: }
678: #else
679: case MAX_VALUES:
680: #endif
681: case NOT_SET_VALUES:
682: break;
683: }
684: }
686: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
687: {
688: PetscInt i,idx,idy;
690: switch (addv) {
691: case INSERT_VALUES:
692: for (i=0; i<n; i++) {
693: idx = *indicesx++;
694: idy = *indicesy++;
695: y[idy] = x[idx];
696: y[idy+1] = x[idx+1];
697: y[idy+2] = x[idx+2];
698: }
699: break;
700: case ADD_VALUES:
701: for (i=0; i<n; i++) {
702: idx = *indicesx++;
703: idy = *indicesy++;
704: y[idy] += x[idx];
705: y[idy+1] += x[idx+1];
706: y[idy+2] += x[idx+2];
707: }
708: break;
709: #if !defined(PETSC_USE_COMPLEX)
710: case MAX_VALUES:
711: for (i=0; i<n; i++) {
712: idx = *indicesx++;
713: idy = *indicesy++;
714: y[idy] = PetscMax(y[idy],x[idx]);
715: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
716: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
717: }
718: #else
719: case MAX_VALUES:
720: #endif
721: case NOT_SET_VALUES:
722: break;
723: }
724: }
725: /* ----------------------------------------------------------------------------------------------- */
726: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
727: {
728: PetscInt i,idx;
730: for (i=0; i<n; i++) {
731: idx = *indicesx++;
732: y[0] = x[idx];
733: y[1] = x[idx+1];
734: y[2] = x[idx+2];
735: y[3] = x[idx+3];
736: y += 4;
737: }
738: }
739: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
740: {
741: PetscInt i,idy;
743: switch (addv) {
744: case INSERT_VALUES:
745: for (i=0; i<n; i++) {
746: idy = *indicesy++;
747: y[idy] = x[0];
748: y[idy+1] = x[1];
749: y[idy+2] = x[2];
750: y[idy+3] = x[3];
751: x += 4;
752: }
753: break;
754: case ADD_VALUES:
755: for (i=0; i<n; i++) {
756: idy = *indicesy++;
757: y[idy] += x[0];
758: y[idy+1] += x[1];
759: y[idy+2] += x[2];
760: y[idy+3] += x[3];
761: x += 4;
762: }
763: break;
764: #if !defined(PETSC_USE_COMPLEX)
765: case MAX_VALUES:
766: for (i=0; i<n; i++) {
767: idy = *indicesy++;
768: y[idy] = PetscMax(y[idy],x[0]);
769: y[idy+1] = PetscMax(y[idy+1],x[1]);
770: y[idy+2] = PetscMax(y[idy+2],x[2]);
771: y[idy+3] = PetscMax(y[idy+3],x[3]);
772: x += 4;
773: }
774: #else
775: case MAX_VALUES:
776: #endif
777: case NOT_SET_VALUES:
778: break;
779: }
780: }
782: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
783: {
784: PetscInt i,idx,idy;
786: switch (addv) {
787: case INSERT_VALUES:
788: for (i=0; i<n; i++) {
789: idx = *indicesx++;
790: idy = *indicesy++;
791: y[idy] = x[idx];
792: y[idy+1] = x[idx+1];
793: y[idy+2] = x[idx+2];
794: y[idy+3] = x[idx+3];
795: }
796: break;
797: case ADD_VALUES:
798: for (i=0; i<n; i++) {
799: idx = *indicesx++;
800: idy = *indicesy++;
801: y[idy] += x[idx];
802: y[idy+1] += x[idx+1];
803: y[idy+2] += x[idx+2];
804: y[idy+3] += x[idx+3];
805: }
806: break;
807: #if !defined(PETSC_USE_COMPLEX)
808: case MAX_VALUES:
809: for (i=0; i<n; i++) {
810: idx = *indicesx++;
811: idy = *indicesy++;
812: y[idy] = PetscMax(y[idy],x[idx]);
813: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
814: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
815: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
816: }
817: #else
818: case MAX_VALUES:
819: #endif
820: case NOT_SET_VALUES:
821: break;
822: }
823: }
824: /* ----------------------------------------------------------------------------------------------- */
825: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
826: {
827: PetscInt i,idx;
829: for (i=0; i<n; i++) {
830: idx = *indicesx++;
831: y[0] = x[idx];
832: y[1] = x[idx+1];
833: y[2] = x[idx+2];
834: y[3] = x[idx+3];
835: y[4] = x[idx+4];
836: y += 5;
837: }
838: }
839: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
840: {
841: PetscInt i,idy;
843: switch (addv) {
844: case INSERT_VALUES:
845: for (i=0; i<n; i++) {
846: idy = *indicesy++;
847: y[idy] = x[0];
848: y[idy+1] = x[1];
849: y[idy+2] = x[2];
850: y[idy+3] = x[3];
851: y[idy+4] = x[4];
852: x += 5;
853: }
854: break;
855: case ADD_VALUES:
856: for (i=0; i<n; i++) {
857: idy = *indicesy++;
858: y[idy] += x[0];
859: y[idy+1] += x[1];
860: y[idy+2] += x[2];
861: y[idy+3] += x[3];
862: y[idy+4] += x[4];
863: x += 5;
864: }
865: break;
866: #if !defined(PETSC_USE_COMPLEX)
867: case MAX_VALUES:
868: for (i=0; i<n; i++) {
869: idy = *indicesy++;
870: y[idy] = PetscMax(y[idy],x[0]);
871: y[idy+1] = PetscMax(y[idy+1],x[1]);
872: y[idy+2] = PetscMax(y[idy+2],x[2]);
873: y[idy+3] = PetscMax(y[idy+3],x[3]);
874: y[idy+4] = PetscMax(y[idy+4],x[4]);
875: x += 5;
876: }
877: #else
878: case MAX_VALUES:
879: #endif
880: case NOT_SET_VALUES:
881: break;
882: }
883: }
885: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
886: {
887: PetscInt i,idx,idy;
889: switch (addv) {
890: case INSERT_VALUES:
891: for (i=0; i<n; i++) {
892: idx = *indicesx++;
893: idy = *indicesy++;
894: y[idy] = x[idx];
895: y[idy+1] = x[idx+1];
896: y[idy+2] = x[idx+2];
897: y[idy+3] = x[idx+3];
898: y[idy+4] = x[idx+4];
899: }
900: break;
901: case ADD_VALUES:
902: for (i=0; i<n; i++) {
903: idx = *indicesx++;
904: idy = *indicesy++;
905: y[idy] += x[idx];
906: y[idy+1] += x[idx+1];
907: y[idy+2] += x[idx+2];
908: y[idy+3] += x[idx+3];
909: y[idy+4] += x[idx+4];
910: }
911: break;
912: #if !defined(PETSC_USE_COMPLEX)
913: case MAX_VALUES:
914: for (i=0; i<n; i++) {
915: idx = *indicesx++;
916: idy = *indicesy++;
917: y[idy] = PetscMax(y[idy],x[idx]);
918: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
919: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
920: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
921: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
922: }
923: #else
924: case MAX_VALUES:
925: #endif
926: case NOT_SET_VALUES:
927: break;
928: }
929: }
930: /* ----------------------------------------------------------------------------------------------- */
931: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
932: {
933: PetscInt i,idx;
935: for (i=0; i<n; i++) {
936: idx = *indicesx++;
937: y[0] = x[idx];
938: y[1] = x[idx+1];
939: y[2] = x[idx+2];
940: y[3] = x[idx+3];
941: y[4] = x[idx+4];
942: y[5] = x[idx+5];
943: y += 6;
944: }
945: }
946: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
947: {
948: PetscInt i,idy;
950: switch (addv) {
951: case INSERT_VALUES:
952: for (i=0; i<n; i++) {
953: idy = *indicesy++;
954: y[idy] = x[0];
955: y[idy+1] = x[1];
956: y[idy+2] = x[2];
957: y[idy+3] = x[3];
958: y[idy+4] = x[4];
959: y[idy+5] = x[5];
960: x += 6;
961: }
962: break;
963: case ADD_VALUES:
964: for (i=0; i<n; i++) {
965: idy = *indicesy++;
966: y[idy] += x[0];
967: y[idy+1] += x[1];
968: y[idy+2] += x[2];
969: y[idy+3] += x[3];
970: y[idy+4] += x[4];
971: y[idy+5] += x[5];
972: x += 6;
973: }
974: break;
975: #if !defined(PETSC_USE_COMPLEX)
976: case MAX_VALUES:
977: for (i=0; i<n; i++) {
978: idy = *indicesy++;
979: y[idy] = PetscMax(y[idy],x[0]);
980: y[idy+1] = PetscMax(y[idy+1],x[1]);
981: y[idy+2] = PetscMax(y[idy+2],x[2]);
982: y[idy+3] = PetscMax(y[idy+3],x[3]);
983: y[idy+4] = PetscMax(y[idy+4],x[4]);
984: y[idy+5] = PetscMax(y[idy+5],x[5]);
985: x += 6;
986: }
987: #else
988: case MAX_VALUES:
989: #endif
990: case NOT_SET_VALUES:
991: break;
992: }
993: }
995: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
996: {
997: PetscInt i,idx,idy;
999: switch (addv) {
1000: case INSERT_VALUES:
1001: for (i=0; i<n; i++) {
1002: idx = *indicesx++;
1003: idy = *indicesy++;
1004: y[idy] = x[idx];
1005: y[idy+1] = x[idx+1];
1006: y[idy+2] = x[idx+2];
1007: y[idy+3] = x[idx+3];
1008: y[idy+4] = x[idx+4];
1009: y[idy+5] = x[idx+5];
1010: }
1011: break;
1012: case ADD_VALUES:
1013: for (i=0; i<n; i++) {
1014: idx = *indicesx++;
1015: idy = *indicesy++;
1016: y[idy] += x[idx];
1017: y[idy+1] += x[idx+1];
1018: y[idy+2] += x[idx+2];
1019: y[idy+3] += x[idx+3];
1020: y[idy+4] += x[idx+4];
1021: y[idy+5] += x[idx+5];
1022: }
1023: break;
1024: #if !defined(PETSC_USE_COMPLEX)
1025: case MAX_VALUES:
1026: for (i=0; i<n; i++) {
1027: idx = *indicesx++;
1028: idy = *indicesy++;
1029: y[idy] = PetscMax(y[idy],x[idx]);
1030: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1031: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1032: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1033: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1034: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1035: }
1036: #else
1037: case MAX_VALUES:
1038: #endif
1039: case NOT_SET_VALUES:
1040: break;
1041: }
1042: }
1043: /* ----------------------------------------------------------------------------------------------- */
1044: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1045: {
1046: PetscInt i,idx;
1048: for (i=0; i<n; i++) {
1049: idx = *indicesx++;
1050: y[0] = x[idx];
1051: y[1] = x[idx+1];
1052: y[2] = x[idx+2];
1053: y[3] = x[idx+3];
1054: y[4] = x[idx+4];
1055: y[5] = x[idx+5];
1056: y[6] = x[idx+6];
1057: y += 7;
1058: }
1059: }
1060: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1061: {
1062: PetscInt i,idy;
1064: switch (addv) {
1065: case INSERT_VALUES:
1066: for (i=0; i<n; i++) {
1067: idy = *indicesy++;
1068: y[idy] = x[0];
1069: y[idy+1] = x[1];
1070: y[idy+2] = x[2];
1071: y[idy+3] = x[3];
1072: y[idy+4] = x[4];
1073: y[idy+5] = x[5];
1074: y[idy+6] = x[6];
1075: x += 7;
1076: }
1077: break;
1078: case ADD_VALUES:
1079: for (i=0; i<n; i++) {
1080: idy = *indicesy++;
1081: y[idy] += x[0];
1082: y[idy+1] += x[1];
1083: y[idy+2] += x[2];
1084: y[idy+3] += x[3];
1085: y[idy+4] += x[4];
1086: y[idy+5] += x[5];
1087: y[idy+6] += x[6];
1088: x += 7;
1089: }
1090: break;
1091: #if !defined(PETSC_USE_COMPLEX)
1092: case MAX_VALUES:
1093: for (i=0; i<n; i++) {
1094: idy = *indicesy++;
1095: y[idy] = PetscMax(y[idy],x[0]);
1096: y[idy+1] = PetscMax(y[idy+1],x[1]);
1097: y[idy+2] = PetscMax(y[idy+2],x[2]);
1098: y[idy+3] = PetscMax(y[idy+3],x[3]);
1099: y[idy+4] = PetscMax(y[idy+4],x[4]);
1100: y[idy+5] = PetscMax(y[idy+5],x[5]);
1101: y[idy+6] = PetscMax(y[idy+6],x[6]);
1102: x += 7;
1103: }
1104: #else
1105: case MAX_VALUES:
1106: #endif
1107: case NOT_SET_VALUES:
1108: break;
1109: }
1110: }
1112: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1113: {
1114: PetscInt i,idx,idy;
1116: switch (addv) {
1117: case INSERT_VALUES:
1118: for (i=0; i<n; i++) {
1119: idx = *indicesx++;
1120: idy = *indicesy++;
1121: y[idy] = x[idx];
1122: y[idy+1] = x[idx+1];
1123: y[idy+2] = x[idx+2];
1124: y[idy+3] = x[idx+3];
1125: y[idy+4] = x[idx+4];
1126: y[idy+5] = x[idx+5];
1127: y[idy+6] = x[idx+6];
1128: }
1129: break;
1130: case ADD_VALUES:
1131: for (i=0; i<n; i++) {
1132: idx = *indicesx++;
1133: idy = *indicesy++;
1134: y[idy] += x[idx];
1135: y[idy+1] += x[idx+1];
1136: y[idy+2] += x[idx+2];
1137: y[idy+3] += x[idx+3];
1138: y[idy+4] += x[idx+4];
1139: y[idy+5] += x[idx+5];
1140: y[idy+6] += x[idx+6];
1141: }
1142: break;
1143: #if !defined(PETSC_USE_COMPLEX)
1144: case MAX_VALUES:
1145: for (i=0; i<n; i++) {
1146: idx = *indicesx++;
1147: idy = *indicesy++;
1148: y[idy] = PetscMax(y[idy],x[idx]);
1149: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1150: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1151: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1152: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1153: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1154: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1155: }
1156: #else
1157: case MAX_VALUES:
1158: #endif
1159: case NOT_SET_VALUES:
1160: break;
1161: }
1162: }
1163: /* ----------------------------------------------------------------------------------------------- */
1164: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1165: {
1166: PetscInt i,idx;
1168: for (i=0; i<n; i++) {
1169: idx = *indicesx++;
1170: y[0] = x[idx];
1171: y[1] = x[idx+1];
1172: y[2] = x[idx+2];
1173: y[3] = x[idx+3];
1174: y[4] = x[idx+4];
1175: y[5] = x[idx+5];
1176: y[6] = x[idx+6];
1177: y[7] = x[idx+7];
1178: y += 8;
1179: }
1180: }
1181: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1182: {
1183: PetscInt i,idy;
1185: switch (addv) {
1186: case INSERT_VALUES:
1187: for (i=0; i<n; i++) {
1188: idy = *indicesy++;
1189: y[idy] = x[0];
1190: y[idy+1] = x[1];
1191: y[idy+2] = x[2];
1192: y[idy+3] = x[3];
1193: y[idy+4] = x[4];
1194: y[idy+5] = x[5];
1195: y[idy+6] = x[6];
1196: y[idy+7] = x[7];
1197: x += 8;
1198: }
1199: break;
1200: case ADD_VALUES:
1201: for (i=0; i<n; i++) {
1202: idy = *indicesy++;
1203: y[idy] += x[0];
1204: y[idy+1] += x[1];
1205: y[idy+2] += x[2];
1206: y[idy+3] += x[3];
1207: y[idy+4] += x[4];
1208: y[idy+5] += x[5];
1209: y[idy+6] += x[6];
1210: y[idy+7] += x[7];
1211: x += 8;
1212: }
1213: break;
1214: #if !defined(PETSC_USE_COMPLEX)
1215: case MAX_VALUES:
1216: for (i=0; i<n; i++) {
1217: idy = *indicesy++;
1218: y[idy] = PetscMax(y[idy],x[0]);
1219: y[idy+1] = PetscMax(y[idy+1],x[1]);
1220: y[idy+2] = PetscMax(y[idy+2],x[2]);
1221: y[idy+3] = PetscMax(y[idy+3],x[3]);
1222: y[idy+4] = PetscMax(y[idy+4],x[4]);
1223: y[idy+5] = PetscMax(y[idy+5],x[5]);
1224: y[idy+6] = PetscMax(y[idy+6],x[6]);
1225: y[idy+7] = PetscMax(y[idy+7],x[7]);
1226: x += 8;
1227: }
1228: #else
1229: case MAX_VALUES:
1230: #endif
1231: case NOT_SET_VALUES:
1232: break;
1233: }
1234: }
1236: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1237: {
1238: PetscInt i,idx,idy;
1240: switch (addv) {
1241: case INSERT_VALUES:
1242: for (i=0; i<n; i++) {
1243: idx = *indicesx++;
1244: idy = *indicesy++;
1245: y[idy] = x[idx];
1246: y[idy+1] = x[idx+1];
1247: y[idy+2] = x[idx+2];
1248: y[idy+3] = x[idx+3];
1249: y[idy+4] = x[idx+4];
1250: y[idy+5] = x[idx+5];
1251: y[idy+6] = x[idx+6];
1252: y[idy+7] = x[idx+7];
1253: }
1254: break;
1255: case ADD_VALUES:
1256: for (i=0; i<n; i++) {
1257: idx = *indicesx++;
1258: idy = *indicesy++;
1259: y[idy] += x[idx];
1260: y[idy+1] += x[idx+1];
1261: y[idy+2] += x[idx+2];
1262: y[idy+3] += x[idx+3];
1263: y[idy+4] += x[idx+4];
1264: y[idy+5] += x[idx+5];
1265: y[idy+6] += x[idx+6];
1266: y[idy+7] += x[idx+7];
1267: }
1268: break;
1269: #if !defined(PETSC_USE_COMPLEX)
1270: case MAX_VALUES:
1271: for (i=0; i<n; i++) {
1272: idx = *indicesx++;
1273: idy = *indicesy++;
1274: y[idy] = PetscMax(y[idy],x[idx]);
1275: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1276: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1277: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1278: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1279: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1280: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1281: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1282: }
1283: #else
1284: case MAX_VALUES:
1285: #endif
1286: case NOT_SET_VALUES:
1287: break;
1288: }
1289: }
1291: /* ----------------------------------------------------------------------------------------------- */
1292: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1293: {
1294: PetscInt i,idx;
1296: for (i=0; i<n; i++) {
1297: idx = *indicesx++;
1298: y[0] = x[idx];
1299: y[1] = x[idx+1];
1300: y[2] = x[idx+2];
1301: y[3] = x[idx+3];
1302: y[4] = x[idx+4];
1303: y[5] = x[idx+5];
1304: y[6] = x[idx+6];
1305: y[7] = x[idx+7];
1306: y[8] = x[idx+8];
1307: y[9] = x[idx+9];
1308: y[10] = x[idx+10];
1309: y[11] = x[idx+11];
1310: y += 12;
1311: }
1312: }
1313: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1314: {
1315: PetscInt i,idy;
1317: switch (addv) {
1318: case INSERT_VALUES:
1319: for (i=0; i<n; i++) {
1320: idy = *indicesy++;
1321: y[idy] = x[0];
1322: y[idy+1] = x[1];
1323: y[idy+2] = x[2];
1324: y[idy+3] = x[3];
1325: y[idy+4] = x[4];
1326: y[idy+5] = x[5];
1327: y[idy+6] = x[6];
1328: y[idy+7] = x[7];
1329: y[idy+8] = x[8];
1330: y[idy+9] = x[9];
1331: y[idy+10] = x[10];
1332: y[idy+11] = x[11];
1333: x += 12;
1334: }
1335: break;
1336: case ADD_VALUES:
1337: for (i=0; i<n; i++) {
1338: idy = *indicesy++;
1339: y[idy] += x[0];
1340: y[idy+1] += x[1];
1341: y[idy+2] += x[2];
1342: y[idy+3] += x[3];
1343: y[idy+4] += x[4];
1344: y[idy+5] += x[5];
1345: y[idy+6] += x[6];
1346: y[idy+7] += x[7];
1347: y[idy+8] += x[8];
1348: y[idy+9] += x[9];
1349: y[idy+10] += x[10];
1350: y[idy+11] += x[11];
1351: x += 12;
1352: }
1353: break;
1354: #if !defined(PETSC_USE_COMPLEX)
1355: case MAX_VALUES:
1356: for (i=0; i<n; i++) {
1357: idy = *indicesy++;
1358: y[idy] = PetscMax(y[idy],x[0]);
1359: y[idy+1] = PetscMax(y[idy+1],x[1]);
1360: y[idy+2] = PetscMax(y[idy+2],x[2]);
1361: y[idy+3] = PetscMax(y[idy+3],x[3]);
1362: y[idy+4] = PetscMax(y[idy+4],x[4]);
1363: y[idy+5] = PetscMax(y[idy+5],x[5]);
1364: y[idy+6] = PetscMax(y[idy+6],x[6]);
1365: y[idy+7] = PetscMax(y[idy+7],x[7]);
1366: y[idy+8] = PetscMax(y[idy+8],x[8]);
1367: y[idy+9] = PetscMax(y[idy+9],x[9]);
1368: y[idy+10] = PetscMax(y[idy+10],x[10]);
1369: y[idy+11] = PetscMax(y[idy+11],x[11]);
1370: x += 12;
1371: }
1372: #else
1373: case MAX_VALUES:
1374: #endif
1375: case NOT_SET_VALUES:
1376: break;
1377: }
1378: }
1380: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1381: {
1382: PetscInt i,idx,idy;
1384: switch (addv) {
1385: case INSERT_VALUES:
1386: for (i=0; i<n; i++) {
1387: idx = *indicesx++;
1388: idy = *indicesy++;
1389: y[idy] = x[idx];
1390: y[idy+1] = x[idx+1];
1391: y[idy+2] = x[idx+2];
1392: y[idy+3] = x[idx+3];
1393: y[idy+4] = x[idx+4];
1394: y[idy+5] = x[idx+5];
1395: y[idy+6] = x[idx+6];
1396: y[idy+7] = x[idx+7];
1397: y[idy+8] = x[idx+8];
1398: y[idy+9] = x[idx+9];
1399: y[idy+10] = x[idx+10];
1400: y[idy+11] = x[idx+11];
1401: }
1402: break;
1403: case ADD_VALUES:
1404: for (i=0; i<n; i++) {
1405: idx = *indicesx++;
1406: idy = *indicesy++;
1407: y[idy] += x[idx];
1408: y[idy+1] += x[idx+1];
1409: y[idy+2] += x[idx+2];
1410: y[idy+3] += x[idx+3];
1411: y[idy+4] += x[idx+4];
1412: y[idy+5] += x[idx+5];
1413: y[idy+6] += x[idx+6];
1414: y[idy+7] += x[idx+7];
1415: y[idy+8] += x[idx+8];
1416: y[idy+9] += x[idx+9];
1417: y[idy+10] += x[idx+10];
1418: y[idy+11] += x[idx+11];
1419: }
1420: break;
1421: #if !defined(PETSC_USE_COMPLEX)
1422: case MAX_VALUES:
1423: for (i=0; i<n; i++) {
1424: idx = *indicesx++;
1425: idy = *indicesy++;
1426: y[idy] = PetscMax(y[idy],x[idx]);
1427: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1428: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1429: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1430: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1431: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1432: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1433: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1434: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1435: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1436: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1437: y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1438: }
1439: #else
1440: case MAX_VALUES:
1441: #endif
1442: case NOT_SET_VALUES:
1443: break;
1444: }
1445: }
1447: /* Create the VecScatterBegin/End_P for our chosen block sizes */
1448: #define BS 1
1449: #include "../src/vec/vec/utils/vpscat.h"
1450: #define BS 2
1451: #include "../src/vec/vec/utils/vpscat.h"
1452: #define BS 3
1453: #include "../src/vec/vec/utils/vpscat.h"
1454: #define BS 4
1455: #include "../src/vec/vec/utils/vpscat.h"
1456: #define BS 5
1457: #include "../src/vec/vec/utils/vpscat.h"
1458: #define BS 6
1459: #include "../src/vec/vec/utils/vpscat.h"
1460: #define BS 7
1461: #include "../src/vec/vec/utils/vpscat.h"
1462: #define BS 8
1463: #include "../src/vec/vec/utils/vpscat.h"
1464: #define BS 12
1465: #include "../src/vec/vec/utils/vpscat.h"
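/*
   Editorial sketch, not part of the original PETSc source: the token-pasting pattern
   that a header included repeatedly with different BS values typically relies on to
   generate one routine per block size.  The macro and function names here are
   hypothetical, not the ones used by vpscat.h.
*/
#define EXAMPLE_PASTE_(base,bs) base##bs
#define EXAMPLE_PASTE(base,bs)  EXAMPLE_PASTE_(base,bs)
#define EXAMPLE_BS 4
PETSC_STATIC_INLINE PetscInt EXAMPLE_PASTE(ExamplePackedLength_,EXAMPLE_BS)(PetscInt nblocks)
{
  return EXAMPLE_BS*nblocks;   /* expands to ExamplePackedLength_4(), hard-wired for bs = 4 */
}
#undef EXAMPLE_BS
#undef EXAMPLE_PASTE
#undef EXAMPLE_PASTE_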
1467: /* ==========================================================================================*/
1469: /* create parallel to sequential scatter context */
1471: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);
1475: /*@
1476: VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.
1478: Collective on VecScatter
1480: Input Parameters:
1481: +  ctx - the scatter context, obtained with VecScatterCreateEmpty()
1482: .  nsends - number of messages this process will send
1483: .  sendSizes - number of entries in each message to be sent
1484: .  sendProcs - ranks of the processes this process sends to
1485: .  sendIdx - indices from which the sent entries are obtained (in local, on-process numbering); this is one long array of size \sum_{i=0,i<nsends} sendSizes[i]
1486: . nrecvs - number of receives to expect
1487: .  recvSizes - number of entries in each message to be received
1488: . recvProcs - processes who are sending to me
1489: .  recvIdx - indices where the received entries are to be put (in local, on-process numbering); this is one long array of size \sum_{i=0,i<nrecvs} recvSizes[i]
1490: - bs - size of block
1492: Notes: sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support having 0 entries you need
1493: to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.
1495:        Probably does not handle sends to self properly; those should be removed from the counts
1496:  used in allocating space inside the from struct.
1498: Level: intermediate
1500: @*/
1501: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1502: {
1503: VecScatter_MPI_General *from, *to;
1504: PetscInt sendSize, recvSize;
1505: PetscInt n, i;
1506: PetscErrorCode ierr;
1508: /* allocate entire send scatter context */
1509: PetscNewLog(ctx,VecScatter_MPI_General,&to);
1510: to->n = nsends;
1511: for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1512: PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1513: PetscMalloc4(bs*sendSize,PetscScalar,&to->values,sendSize,PetscInt,&to->indices,to->n+1,PetscInt,&to->starts,to->n,PetscMPIInt,&to->procs);
1514: PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1515: to->starts[0] = 0;
1516: for(n = 0; n < to->n; n++) {
1517: if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1518: to->starts[n+1] = to->starts[n] + sendSizes[n];
1519: to->procs[n] = sendProcs[n];
1520: for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1521: to->indices[i] = sendIdx[i];
1522: }
1523: }
1524: ctx->todata = (void *) to;
1526: /* allocate entire receive scatter context */
1527: PetscNewLog(ctx,VecScatter_MPI_General,&from);
1528: from->n = nrecvs;
1529: for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1530: PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1531: PetscMalloc4(bs*recvSize,PetscScalar,&from->values,recvSize,PetscInt,&from->indices,from->n+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1532: from->starts[0] = 0;
1533: for(n = 0; n < from->n; n++) {
1534: if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1535: from->starts[n+1] = from->starts[n] + recvSizes[n];
1536: from->procs[n] = recvProcs[n];
1537: for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1538: from->indices[i] = recvIdx[i];
1539: }
1540: }
1541: ctx->fromdata = (void *)from;
1543: /* No local scatter optimization */
1544: from->local.n = 0;
1545: from->local.vslots = 0;
1546: to->local.n = 0;
1547: to->local.vslots = 0;
1548: from->local.nonmatching_computed = PETSC_FALSE;
1549: from->local.n_nonmatching = 0;
1550: from->local.slots_nonmatching = 0;
1551: to->local.nonmatching_computed = PETSC_FALSE;
1552: to->local.n_nonmatching = 0;
1553: to->local.slots_nonmatching = 0;
1555: from->type = VEC_SCATTER_MPI_GENERAL;
1556: to->type = VEC_SCATTER_MPI_GENERAL;
1557: from->bs = bs;
1558: to->bs = bs;
1559: VecScatterCreateCommon_PtoS(from, to, ctx);
1560: return(0);
1561: }
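/*
   Editorial sketch, not part of the original PETSc source: a minimal call to
   VecScatterCreateLocal() on exactly two processes, each sending one block of three
   entries to the other.  It assumes ctx was already created as the manual page above
   describes (e.g. with VecScatterCreateEmpty()) and that rank is this process' rank;
   ExampleCreateLocal() is a hypothetical helper.
*/
static PetscErrorCode ExampleCreateLocal(VecScatter ctx,PetscMPIInt rank)
{
  PetscInt other = 1 - rank;                     /* the only other process */
  PetscInt sendSizes[1],sendProcs[1],sendIdx[3];
  PetscInt recvSizes[1],recvProcs[1],recvIdx[3];
  PetscInt i;

  sendSizes[0] = 3; sendProcs[0] = other;
  recvSizes[0] = 3; recvProcs[0] = other;
  for (i=0; i<3; i++) { sendIdx[i] = i; recvIdx[i] = i; }   /* local numbering on each side */
  return VecScatterCreateLocal(ctx,1,sendSizes,sendProcs,sendIdx,1,recvSizes,recvProcs,recvIdx,1);
}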
1563: /*
1564: bs indicates how many elements there are in each block. Normally this would be 1.
1566:    Contains a check that PetscMPIInt can handle the sizes needed.
1567: */
1570: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1571: {
1572: VecScatter_MPI_General *from,*to;
1573: PetscMPIInt size,rank,imdex,tag,n;
1574: PetscInt *source = PETSC_NULL,*owners = PETSC_NULL;
1575: PetscInt *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1576: PetscMPIInt *nprocs = PETSC_NULL,nrecvs;
1577: PetscInt i,j,idx,nsends;
1578: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1579: PetscInt *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
1580: PetscMPIInt *onodes1,*olengths1;
1581: MPI_Comm comm;
1582: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1583: MPI_Status recv_status,*send_status;
1584: PetscErrorCode ierr;
1587: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1588: PetscObjectGetComm((PetscObject)xin,&comm);
1589: MPI_Comm_rank(comm,&rank);
1590: MPI_Comm_size(comm,&size);
1591: owners = xin->map->range;
1592: VecGetSize(yin,&lengthy);
1593: VecGetSize(xin,&lengthx);
1595: /* first count number of contributors to each processor */
1596: PetscMalloc2(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner);
1597: PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
1598: j = 0;
1599: nsends = 0;
1600: for (i=0; i<nx; i++) {
1601: idx = inidx[i];
1602: if (idx < owners[j]) j = 0;
1603: for (; j<size; j++) {
1604: if (idx < owners[j+1]) {
1605: if (!nprocs[j]++) nsends++;
1606: owner[i] = j;
1607: break;
1608: }
1609: }
1610: }
1611: nprocslocal = nprocs[rank];
1612: nprocs[rank] = 0;
1613: if (nprocslocal) nsends--;
1614: /* inform other processors of number of messages and max length*/
1615: PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
1616: PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
1617: PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
1618: recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];
1620: /* post receives: */
1621: PetscMalloc3(recvtotal,PetscInt,&rvalues,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1622: count = 0;
1623: for (i=0; i<nrecvs; i++) {
1624: MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
1625: count += olengths1[i];
1626: }
1628: /* do sends:
1629: 1) starts[i] gives the starting index in svalues for stuff going to
1630: the ith processor
1631: */
1632: PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1633: starts[0] = 0;
1634: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1635: for (i=0; i<nx; i++) {
1636: if (owner[i] != rank) {
1637: svalues[starts[owner[i]]++] = inidx[i];
1638: }
1639: }
1641: starts[0] = 0;
1642: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1643: count = 0;
1644: for (i=0; i<size; i++) {
1645: if (nprocs[i]) {
1646: MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
1647: }
1648: }
1650: /* wait on receives */
1651: count = nrecvs;
1652: slen = 0;
1653: while (count) {
1654: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1655: /* unpack receives into our local space */
1656: MPI_Get_count(&recv_status,MPIU_INT,&n);
1657: slen += n;
1658: count--;
1659: }
1661: if (slen != recvtotal) SETERRQ2(PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);
1662:
1663: /* allocate entire send scatter context */
1664: PetscNewLog(ctx,VecScatter_MPI_General,&to);
1665: to->n = nrecvs;
1666: PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1667: PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,nrecvs,PetscMPIInt,&to->procs);
1668: PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,&to->rstatus);
1669: ctx->todata = (void*)to;
1670: to->starts[0] = 0;
1672: if (nrecvs) {
1674: /* move the data into the send scatter */
1675: base = owners[rank];
1676: rsvalues = rvalues;
1677: for (i=0; i<nrecvs; i++) {
1678: to->starts[i+1] = to->starts[i] + olengths1[i];
1679: to->procs[i] = onodes1[i];
1680: values = rsvalues;
1681: rsvalues += olengths1[i];
1682: for (j=0; j<olengths1[i]; j++) {
1683: to->indices[to->starts[i] + j] = values[j] - base;
1684: }
1685: }
1686: }
1687: PetscFree(olengths1);
1688: PetscFree(onodes1);
1689: PetscFree3(rvalues,source,recv_waits);
1691: /* allocate entire receive scatter context */
1692: PetscNewLog(ctx,VecScatter_MPI_General,&from);
1693: from->n = nsends;
1695: PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1696: PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1697: ctx->fromdata = (void*)from;
1699: /* move data into receive scatter */
1700: PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1701: count = 0; from->starts[0] = start[0] = 0;
1702: for (i=0; i<size; i++) {
1703: if (nprocs[i]) {
1704: lowner[i] = count;
1705: from->procs[count++] = i;
1706: from->starts[count] = start[count] = start[count-1] + nprocs[i];
1707: }
1708: }
1710: for (i=0; i<nx; i++) {
1711: if (owner[i] != rank) {
1712: from->indices[start[lowner[owner[i]]]++] = inidy[i];
1713: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1714: }
1715: }
1716: PetscFree2(lowner,start);
1717: PetscFree2(nprocs,owner);
1718:
1719: /* wait on sends */
1720: if (nsends) {
1721: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1722: MPI_Waitall(nsends,send_waits,send_status);
1723: PetscFree(send_status);
1724: }
1725: PetscFree3(svalues,send_waits,starts);
1727: if (nprocslocal) {
1728: PetscInt nt = from->local.n = to->local.n = nprocslocal;
1729: /* we have a scatter to ourselves */
1730: PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1731: PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1732: nt = 0;
1733: for (i=0; i<nx; i++) {
1734: idx = inidx[i];
1735: if (idx >= owners[rank] && idx < owners[rank+1]) {
1736: to->local.vslots[nt] = idx - owners[rank];
1737: from->local.vslots[nt++] = inidy[i];
1738: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1739: }
1740: }
1741: } else {
1742: from->local.n = 0;
1743: from->local.vslots = 0;
1744: to->local.n = 0;
1745: to->local.vslots = 0;
1746: }
1748: from->local.nonmatching_computed = PETSC_FALSE;
1749: from->local.n_nonmatching = 0;
1750: from->local.slots_nonmatching = 0;
1751: to->local.nonmatching_computed = PETSC_FALSE;
1752: to->local.n_nonmatching = 0;
1753: to->local.slots_nonmatching = 0;
1755: from->type = VEC_SCATTER_MPI_GENERAL;
1756: to->type = VEC_SCATTER_MPI_GENERAL;
1757: from->bs = bs;
1758: to->bs = bs;
1759: VecScatterCreateCommon_PtoS(from,to,ctx);
1760: return(0);
1761: }
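/*
   Editorial sketch, not part of the original PETSc source: the ownership lookup that
   the counting loop at the top of VecScatterCreate_PtoS() performs, written as a
   standalone helper.  owners[] is the range array owners[0..size] with
   owners[rank] <= idx < owners[rank+1] for indices owned by rank; ExampleFindOwner()
   is a hypothetical name.
*/
static PetscMPIInt ExampleFindOwner(PetscInt idx,PetscMPIInt size,const PetscInt *owners)
{
  PetscMPIInt j;
  for (j=0; j<size; j++) {
    if (idx >= owners[j] && idx < owners[j+1]) return j;
  }
  return -1;   /* idx is outside the global range */
}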
1763: /*
1764: bs indicates how many elements there are in each block. Normally this would be 1.
1765: */
1768: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1769: {
1770: MPI_Comm comm = ((PetscObject)ctx)->comm;
1771: PetscMPIInt tag = ((PetscObject)ctx)->tag, tagr;
1772: PetscInt bs = to->bs;
1773: PetscMPIInt size;
1774: PetscInt i, n;
1776:
1778: PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1779: ctx->destroy = VecScatterDestroy_PtoP;
1781: PetscOptionsHasName(PETSC_NULL,"-vecscatter_reproduce",&ctx->reproduce);
1782: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1783: from->sendfirst = to->sendfirst;
1785: MPI_Comm_size(comm,&size);
1786: /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1787: to->contiq = PETSC_FALSE;
1788: n = from->starts[from->n];
1789: from->contiq = PETSC_TRUE;
1790: for (i=1; i<n; i++) {
1791: if (from->indices[i] != from->indices[i-1] + bs) {
1792: from->contiq = PETSC_FALSE;
1793: break;
1794: }
1795: }
1797: PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv);
1798: from->use_alltoallv = to->use_alltoallv;
1799: if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1800: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1801: if (to->use_alltoallv) {
1802: PetscOptionsHasName(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw);
1803: }
1804: from->use_alltoallw = to->use_alltoallw;
1805: if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1806: #endif
1808: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1809: PetscOptionsHasName(PETSC_NULL,"-vecscatter_window",&to->use_window);
1810: from->use_window = to->use_window;
1811: #endif
1813: if (to->use_alltoallv) {
1815: PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1816: PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1817: for (i=0; i<to->n; i++) {
1818: to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1819: }
1820: to->displs[0] = 0;
1821: for (i=1; i<size; i++) {
1822: to->displs[i] = to->displs[i-1] + to->counts[i-1];
1823: }
1825: PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1826: PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1827: for (i=0; i<from->n; i++) {
1828: from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1829: }
1830: from->displs[0] = 0;
1831: for (i=1; i<size; i++) {
1832: from->displs[i] = from->displs[i-1] + from->counts[i-1];
1833: }
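    /*
       Editor's sketch (buffer names sbuf and rbuf are assumptions, standing for the packed
       send and receive areas): the arrays built above are exactly the per-process counts and
       displacements MPI_Alltoallv() expects,

          MPI_Alltoallv(sbuf,to->counts,to->displs,MPIU_SCALAR,
                        rbuf,from->counts,from->displs,MPIU_SCALAR,comm);

       with a zero count for every process this rank exchanges no data with.
    */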
1834: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1835: if (to->use_alltoallw) {
1836: PetscMPIInt mpibs = PetscMPIIntCast(bs), mpilen;
1837: ctx->packtogether = PETSC_FALSE;
1838: PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1839: PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1840: PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1841: for (i=0; i<size; i++) {
1842: to->types[i] = MPIU_SCALAR;
1843: }
1845: for (i=0; i<to->n; i++) {
1846: to->wcounts[to->procs[i]] = 1;
1847: mpilen = PetscMPIIntCast(to->starts[i+1]-to->starts[i]);
1848: MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1849: MPI_Type_commit(to->types+to->procs[i]);
1850: }
1851: PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1852: PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1853: PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1854: for (i=0; i<size; i++) {
1855: from->types[i] = MPIU_SCALAR;
1856: }
1857: if (from->contiq) {
1858: PetscInfo(ctx,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
1859: for (i=0; i<from->n; i++) {
1860: from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1861: }
1862: if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1863: for (i=1; i<from->n; i++) {
1864: from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1865: }
1866: } else {
1867: for (i=0; i<from->n; i++) {
1868: from->wcounts[from->procs[i]] = 1;
1869: mpilen = PetscMPIIntCast(from->starts[i+1]-from->starts[i]);
1870: MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1871: MPI_Type_commit(from->types+from->procs[i]);
1872: }
1873: }
1874: } else {
1875: ctx->copy = VecScatterCopy_PtoP_AllToAll;
1876: }
1877: #else
1878: to->use_alltoallw = PETSC_FALSE;
1879: from->use_alltoallw = PETSC_FALSE;
1880: ctx->copy = VecScatterCopy_PtoP_AllToAll;
1881: #endif
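    /*
       Editor's note (added for clarity): the alltoallw path above commits one indexed-block
       datatype per neighbor; MPI_Type_create_indexed_block(count,blen,displs,old,newtype)
       describes count blocks of blen old-type elements at the element offsets in displs, so
       MPI_Alltoallw() can pick the scattered entries up, and drop them off, in place and the
       explicit packing loops are skipped (-vecscatter_nopack).
    */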
1882: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1883: } else if (to->use_window) {
1884: PetscMPIInt temptag,winsize;
1885: MPI_Request *request;
1886: MPI_Status *status;
1887:
1888: PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1889: winsize = (to->n ? to->starts[to->n] : 0)*sizeof(PetscScalar);
1890: MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1891: PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);
1892: PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1893: for (i=0; i<to->n; i++) {
1894: MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1895: }
1896: for (i=0; i<from->n; i++) {
1897: MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1898: }
1899: MPI_Waitall(to->n,request,status);
1900: PetscFree2(request,status);
1902: winsize = (from->n ? from->starts[from->n] : 0)*sizeof(PetscScalar);
1903: MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1904: PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts);
1905: PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1906: for (i=0; i<from->n; i++) {
1907: MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1908: }
1909: for (i=0; i<to->n; i++) {
1910: MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1911: }
1912: MPI_Waitall(from->n,request,status);
1913: PetscFree2(request,status);
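      /*
         Editor's sketch (illustrative; the actual communication is issued in the scatter
         begin/end routines, and the buffer name is an assumption): after this exchange
         to->winstarts[i] holds the element offset inside the receive window of process
         to->procs[i] at which this rank's block is expected, so a forward scatter can
         deposit its packed data with roughly

            MPI_Put(sendbuf+bs*to->starts[i],bs*(to->starts[i+1]-to->starts[i]),MPIU_SCALAR,
                    to->procs[i],bs*to->winstarts[i],bs*(to->starts[i+1]-to->starts[i]),
                    MPIU_SCALAR,from->window);

         inside a window synchronization epoch.
      */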
1914: #endif
1915: } else {
1916: PetscTruth use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1917: PetscInt *sstarts = to->starts, *rstarts = from->starts;
1918: PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
1919: MPI_Request *swaits = to->requests,*rwaits = from->requests;
1920: MPI_Request *rev_swaits,*rev_rwaits;
1921: PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
1923: /* allocate additional wait variables for the "reverse" scatter */
1924: PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1925: PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1926: to->rev_requests = rev_rwaits;
1927: from->rev_requests = rev_swaits;
1929: /* Select the send mode, then create the persistent send requests; the reverse-scatter sends post data out of the receive buffer */
1930: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&use_rsend);
1931: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1932: if (use_rsend) {
1933: PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
1934: to->use_readyreceiver = PETSC_TRUE;
1935: from->use_readyreceiver = PETSC_TRUE;
1936: } else {
1937: to->use_readyreceiver = PETSC_FALSE;
1938: from->use_readyreceiver = PETSC_FALSE;
1939: }
1940: if (use_ssend) {
1941: PetscInfo(ctx,"Using VecScatter Ssend mode\n");
1942: }
1944: for (i=0; i<from->n; i++) {
1945: if (use_rsend) {
1946: MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1947: } else if (use_ssend) {
1948: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1949: } else {
1950: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1951: }
1952: }
1954: for (i=0; i<to->n; i++) {
1955: if (use_rsend) {
1956: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1957: } else if (use_ssend) {
1958: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1959: } else {
1960: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1961: }
1962: }
1963: /* Register receives for scatter and reverse */
1964: for (i=0; i<from->n; i++) {
1965: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1966: }
1967: for (i=0; i<to->n; i++) {
1968: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1969: }
1970: if (use_rsend) {
1971: if (to->n) {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1972: if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1973: MPI_Barrier(comm);
1974: }
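    /*
       Editor's sketch (illustrative; the real sequencing lives in the VecScatterBegin/End
       routines selected below, and rstatus is an assumed status array): the persistent
       requests created above are reused on every application of the scatter, roughly

          MPI_Startall(from->n,from->requests);          start the preposted receives
          ...pack the send buffer...
          MPI_Startall(to->n,to->requests);              start the persistent sends
          MPI_Waitall(from->n,from->requests,rstatus);   unpack as receives complete

       which is also why ready-send mode must pre-start the receives and synchronize with
       MPI_Barrier() above before any ready send may be issued.
    */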
1976: ctx->copy = VecScatterCopy_PtoP_X;
1977: }
1978: PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
1979: switch (bs) {
1980: case 12:
1981: ctx->begin = VecScatterBegin_12;
1982: ctx->end = VecScatterEnd_12;
1983: break;
1984: case 8:
1985: ctx->begin = VecScatterBegin_8;
1986: ctx->end = VecScatterEnd_8;
1987: break;
1988: case 7:
1989: ctx->begin = VecScatterBegin_7;
1990: ctx->end = VecScatterEnd_7;
1991: break;
1992: case 6:
1993: ctx->begin = VecScatterBegin_6;
1994: ctx->end = VecScatterEnd_6;
1995: break;
1996: case 5:
1997: ctx->begin = VecScatterBegin_5;
1998: ctx->end = VecScatterEnd_5;
1999: break;
2000: case 4:
2001: ctx->begin = VecScatterBegin_4;
2002: ctx->end = VecScatterEnd_4;
2003: break;
2004: case 3:
2005: ctx->begin = VecScatterBegin_3;
2006: ctx->end = VecScatterEnd_3;
2007: break;
2008: case 2:
2009: ctx->begin = VecScatterBegin_2;
2010: ctx->end = VecScatterEnd_2;
2011: break;
2012: case 1:
2013: ctx->begin = VecScatterBegin_1;
2014: ctx->end = VecScatterEnd_1;
2015: break;
2016: default:
2017: SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
2018: }
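  /*
     Editor's note (added for clarity): the VecScatterBegin_<bs>/VecScatterEnd_<bs> pairs differ
     only in the compile-time block size used while packing and unpacking, which lets the inner
     copy loops be unrolled; an unsupported block size fails here rather than falling back to a
     generic path.
  */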
2019: ctx->view = VecScatterView_MPI;
2020: /* Check if the local scatter is actually a copy; important special case */
2021: if (to->local.n) {
2022: VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
2023: }
2024: return(0);
2025: }
2029: /* ------------------------------------------------------------------------------------*/
2030: /*
2031: Scatter from local Seq vectors to a parallel vector.
2032: Reverses the order of the arguments, calls VecScatterCreate_PtoS(), and then
2033: swaps the to/from parts of the result.
2034: */
2037: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2038: {
2039: PetscErrorCode ierr;
2040: MPI_Request *waits;
2041: VecScatter_MPI_General *to,*from;
2044: VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2045: to = (VecScatter_MPI_General*)ctx->fromdata;
2046: from = (VecScatter_MPI_General*)ctx->todata;
2047: ctx->todata = (void*)to;
2048: ctx->fromdata = (void*)from;
2049: /* these two are special: they are ALWAYS stored in the to struct */
2050: to->sstatus = from->sstatus;
2051: to->rstatus = from->rstatus;
2053: from->sstatus = 0;
2054: from->rstatus = 0;
2056: waits = from->rev_requests;
2057: from->rev_requests = from->requests;
2058: from->requests = waits;
2059: waits = to->rev_requests;
2060: to->rev_requests = to->requests;
2061: to->requests = waits;
2062: return(0);
2063: }
2065: /* ---------------------------------------------------------------------------------*/
2068: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
2069: {
2071: PetscMPIInt size,rank,tag,imdex,n;
2072: PetscInt *owners = xin->map->range;
2073: PetscMPIInt *nprocs = PETSC_NULL;
2074: PetscInt i,j,idx,nsends,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
2075: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
2076: PetscInt *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,*values = PETSC_NULL,*rsvalues,recvtotal,lastidx;
2077: PetscMPIInt *onodes1,*olengths1,nrecvs;
2078: MPI_Comm comm;
2079: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
2080: MPI_Status recv_status,*send_status = PETSC_NULL;
2081: PetscTruth duplicate = PETSC_FALSE;
2082: #if defined(PETSC_USE_DEBUG)
2083: PetscTruth found = PETSC_FALSE;
2084: #endif
2087: PetscObjectGetNewTag((PetscObject)ctx,&tag);
2088: PetscObjectGetComm((PetscObject)xin,&comm);
2089: MPI_Comm_size(comm,&size);
2090: MPI_Comm_rank(comm,&rank);
2091: if (size == 1) {
2092: VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
2093: return(0);
2094: }
2096: /*
2097: Each processor ships its inidx[j],inidy[j] pairs to the processor that owns inidx[j];
2098: the received (now locally owned) pairs are then used to call VecScatterCreate_StoP()
2099: */
2100: /* first count number of contributors to each processor */
2101: PetscMalloc3(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
2102: PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
2103: lastidx = -1;
2104: j = 0;
2105: for (i=0; i<nx; i++) {
2106: /* if indices are NOT locally sorted, need to start search at the beginning */
2107: if (lastidx > (idx = inidx[i])) j = 0;
2108: lastidx = idx;
2109: for (; j<size; j++) {
2110: if (idx >= owners[j] && idx < owners[j+1]) {
2111: nprocs[j]++;
2112: owner[i] = j;
2113: #if defined(PETSC_USE_DEBUG)
2114: found = PETSC_TRUE;
2115: #endif
2116: break;
2117: }
2118: }
2119: #if defined(PETSC_USE_DEBUG)
2120: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2121: found = PETSC_FALSE;
2122: #endif
2123: }
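  /*
     Editor's note (added for clarity): because the ownership search resumes from the previous
     owner and only restarts when inidx is not locally sorted, this loop costs roughly
     O(nx + size) for sorted input instead of O(nx*size).
  */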
2124: nsends = 0; for (i=0; i<size; i++) { nsends += (nprocs[i] > 0);}
2126: /* inform other processors of the number of messages and the length of each */
2127: PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
2128: PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2129: PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2130: recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];
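  /*
     Editor's note (added for clarity): PetscGatherNumberOfMessages() tells this rank how many
     messages it will receive (nrecvs), and PetscGatherMessageLengths() supplies the sending
     ranks (onodes1) and the length of each incoming message (olengths1), so the receives below
     can be posted with exact sizes.
  */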
2132: /* post receives: */
2133: PetscMalloc5(2*recvtotal,PetscInt,&rvalues,2*nx,PetscInt,&svalues,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
2135: count = 0;
2136: for (i=0; i<nrecvs; i++) {
2137: MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2138: count += olengths1[i];
2139: }
2140: PetscFree(onodes1);
2142: /* do sends:
2143: starts[i] gives the starting index in svalues for the entries going to
2144: the ith processor
2145: */
2146: starts[0]= 0;
2147: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2148: for (i=0; i<nx; i++) {
2149: svalues[2*starts[owner[i]]] = inidx[i];
2150: svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2151: }
2153: starts[0] = 0;
2154: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2155: count = 0;
2156: for (i=0; i<size; i++) {
2157: if (nprocs[i]) {
2158: MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2159: count++;
2160: }
2161: }
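  /*
     Editor's note (added for clarity): each entry travels as an interleaved (inidx,inidy) pair,
     hence the factor of 2 in the buffer sizes and send counts; starts[] is rebuilt just above
     because the packing loop advanced it in place.
  */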
2162: PetscFree3(nprocs,owner,starts);
2164: /* wait on receives */
2165: count = nrecvs;
2166: slen = 0;
2167: while (count) {
2168: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2169: /* each message contains interleaved (from,to) pairs; only count them here, unpack below */
2170: MPI_Get_count(&recv_status,MPIU_INT,&n);
2171: slen += n/2;
2172: count--;
2173: }
2174: if (slen != recvtotal) SETERRQ2(PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
2175:
2176: PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2177: base = owners[rank];
2178: count = 0;
2179: rsvalues = rvalues;
2180: for (i=0; i<nrecvs; i++) {
2181: values = rsvalues;
2182: rsvalues += 2*olengths1[i];
2183: for (j=0; j<olengths1[i]; j++) {
2184: local_inidx[count] = values[2*j] - base;
2185: local_inidy[count++] = values[2*j+1];
2186: }
2187: }
2188: PetscFree(olengths1);
2190: /* wait on sends */
2191: if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2192: PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);
2194: /*
2195: should sort and remove duplicates from local_inidx,local_inidy
2196: */
2198: #if defined(do_it_slow)
2199: /* sort on the from index */
2200: PetscSortIntWithArray(slen,local_inidx,local_inidy);
2201: start = 0;
2202: while (start < slen) {
2203: count = start+1;
2204: last = local_inidx[start];
2205: while (count < slen && last == local_inidx[count]) count++;
2206: if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2207: /* sort on to index */
2208: PetscSortInt(count-start,local_inidy+start);
2209: }
2210: /* remove duplicates; not the most efficient way, but probably good enough */
2211: i = start;
2212: while (i < count-1) {
2213: if (local_inidy[i] != local_inidy[i+1]) {
2214: i++;
2215: } else { /* found a duplicate */
2216: duplicate = PETSC_TRUE;
2217: for (j=i; j<slen-1; j++) {
2218: local_inidx[j] = local_inidx[j+1];
2219: local_inidy[j] = local_inidy[j+1];
2220: }
2221: slen--;
2222: count--;
2223: }
2224: }
2225: start = count;
2226: }
2227: #endif
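  /*
     Editor's note (added for clarity): the sort-and-deduplicate pass above is compiled out since
     do_it_slow is never defined, so duplicate stays PETSC_FALSE and the informational message
     below cannot trigger; enabling the block would also appear to require declaring the start
     and last work variables, which this function does not do.
  */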
2228: if (duplicate) {
2229: PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2230: }
2231: VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2232: PetscFree2(local_inidx,local_inidy);
2233: return(0);
2234: }