Actual source code: mpiov.c
2: #define PETSCMAT_DLL
4: /*
5: Routines to compute overlapping regions of a parallel MPI matrix
6: and to find submatrices that are shared across processors.
7: */
8: #include "../src/mat/impls/aij/mpi/mpiaij.h"
9: #include "petscbt.h"
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
19: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20: {
22: PetscInt i;
25: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: for (i=0; i<ov; ++i) {
27: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
28: }
29: return(0);
30: }
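/*
   A minimal usage sketch (not part of this file; the names and row indices are
   illustrative) showing how the routine above is normally reached through the public
   MatIncreaseOverlap() interface, e.g. when building overlapping subdomains for an
   additive Schwarz method:

     IS       is[1];
     PetscInt rows[2] = {0,1};                         // some locally owned rows
     ISCreateGeneral(PETSC_COMM_SELF,2,rows,&is[0]);
     MatIncreaseOverlap(C,1,is,2);                     // two passes of MatIncreaseOverlap_MPIAIJ_Once()
     // is[0] has been replaced by a larger IS that also contains all rows coupled to
     // the original ones, directly or through one intermediate row
     ISDestroy(is[0]);
*/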
32: /*
33: Sample message format:
34: If a processor A wants processor B to process some elements corresponding
35: to index sets is[1],is[5]
36: mesg [0] = 2 (no of index sets in the mesg)
37: -----------
38: mesg [1] = 1 => is[1]
39: mesg [2] = sizeof(is[1]);
40: -----------
41: mesg [3] = 5 => is[5]
42: mesg [4] = sizeof(is[5]);
43: -----------
44: mesg [5]
45: mesg [n] = data(is[1])
46: -----------
47: mesg [n+1]
48: mesg [m] = data(is[5])
49: -----------
50:
51: Notes:
52: nrqs - no of requests sent (or to be sent out)
53: nrqr - no of requests received (which have to be, or which have been, processed)
54: */
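/*
   A sketch (illustration only; not called below) of how a receiver walks a message in
   the format above: the header occupies entries 0 .. 2*nis, the index data follows.

     PetscInt s,isno,len;
     PetscInt nis = mesg[0];            // number of index sets in the message
     PetscInt off = 2*nis + 1;          // index data starts right after the header
     for (s=0; s<nis; s++) {
       isno = mesg[2*s+1];              // which IS this block belongs to
       len  = mesg[2*s+2];              // number of row indices sent for that IS
       // mesg[off] .. mesg[off+len-1] are the rows requested for is[isno]
       off += len;
     }
*/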
57: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
58: {
59: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
60: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*rtable,**data,len;
64: PetscMPIInt size,rank,tag1,tag2;
65: PetscInt m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
67: PetscBT *table;
68: MPI_Comm comm;
69: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
70: MPI_Status *s_status,*recv_status;
73: comm = ((PetscObject)C)->comm;
74: size = c->size;
75: rank = c->rank;
76: m = C->rmap->N;
78: PetscObjectGetNewTag((PetscObject)C,&tag1);
79: PetscObjectGetNewTag((PetscObject)C,&tag2);
80:
81: PetscMalloc((imax+1)*sizeof(PetscInt*)+ (imax + m)*sizeof(PetscInt),&idx);
82: n = (PetscInt*)(idx + imax);
83: rtable = n + imax;
84:
85: for (i=0; i<imax; i++) {
86: ISGetIndices(is[i],&idx[i]);
87: ISGetLocalSize(is[i],&n[i]);
88: }
89:
90: /* Create hash table for the mapping :row -> proc*/
91: for (i=0,j=0; i<size; i++) {
92: len = C->rmap->range[i+1];
93: for (; j<len; j++) {
94: rtable[j] = i;
95: }
96: }
98: /* evaluate communication - message to whom, length of message, and buffer space
99: required. Based on this, buffers are allocated, and data copied into them */
100: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1);/* mesg size */
101: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
102: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
103: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
104: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialise work vector*/
105: for (i=0; i<imax; i++) {
106: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
107: idx_i = idx[i];
108: len = n[i];
109: for (j=0; j<len; j++) {
110: row = idx_i[j];
111: if (row < 0) {
112: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
113: }
114: proc = rtable[row];
115: w4[proc]++;
116: }
117: for (j=0; j<size; j++){
118: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
119: }
120: }
122: nrqs = 0; /* no of outgoing messages */
123: msz = 0; /* total mesg length (for all procs) */
124: w1[rank] = 0; /* no mesg sent to self */
125: w3[rank] = 0;
126: for (i=0; i<size; i++) {
127: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
128: }
129: /* pa - list of processors to communicate with */
130: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
131: for (i=0,j=0; i<size; i++) {
132: if (w1[i]) {pa[j] = i; j++;}
133: }
135: /* Each message would have a header = 1 + 2*(no of IS) + data */
136: for (i=0; i<nrqs; i++) {
137: j = pa[i];
138: w1[j] += w2[j] + 2*w3[j];
139: msz += w1[j];
140: }
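/* For example, a message carrying rows for two index sets with 3 and 7 rows has
   length 1 + 2*2 + (3+7) = 15 entries. */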
142: /* Determine the number of messages to expect, their lengths, and their from-ids */
143: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
144: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
146: /* Now post the Irecvs corresponding to these messages */
147: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
149: /* Allocate Memory for outgoing messages */
150: PetscMalloc(2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt),&outdat);
151: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
152: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
153: tmp = (PetscInt*)(outdat + 2*size);
154: ctr = tmp + msz;
156: {
157: PetscInt *iptr = tmp,ict = 0;
158: for (i=0; i<nrqs; i++) {
159: j = pa[i];
160: iptr += ict;
161: outdat[j] = iptr;
162: ict = w1[j];
163: }
164: }
166: /* Form the outgoing messages */
167: /*plug in the headers*/
168: for (i=0; i<nrqs; i++) {
169: j = pa[i];
170: outdat[j][0] = 0;
171: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
172: ptr[j] = outdat[j] + 2*w3[j] + 1;
173: }
174:
175: /* Memory for doing local proc's work*/
176: {
177: PetscInt *d_p;
178: char *t_p;
180: PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
181: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);
182: PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
183: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);
184: data = (PetscInt **)(table + imax);
185: isz = (PetscInt *)(data + imax);
186: d_p = (PetscInt *)(isz + imax);
187: t_p = (char *)(d_p + m*imax);
188: for (i=0; i<imax; i++) {
189: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
190: data[i] = d_p + (m)*i;
191: }
192: }
194: /* Parse the IS and update local tables and the outgoing buf with the data*/
195: {
196: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
197: PetscBT table_i;
199: for (i=0; i<imax; i++) {
200: PetscMemzero(ctr,size*sizeof(PetscInt));
201: n_i = n[i];
202: table_i = table[i];
203: idx_i = idx[i];
204: data_i = data[i];
205: isz_i = isz[i];
206: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
207: row = idx_i[j];
208: proc = rtable[row];
209: if (proc != rank) { /* copy to the outgoing buffer */
210: ctr[proc]++;
211: *ptr[proc] = row;
212: ptr[proc]++;
213: } else { /* Update the local table */
214: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
215: }
216: }
217: /* Update the headers for the current IS */
218: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
219: if ((ctr_j = ctr[j])) {
220: outdat_j = outdat[j];
221: k = ++outdat_j[0];
222: outdat_j[2*k] = ctr_j;
223: outdat_j[2*k-1] = i;
224: }
225: }
226: isz[i] = isz_i;
227: }
228: }
229:
232: /* Now post the sends */
233: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
234: for (i=0; i<nrqs; ++i) {
235: j = pa[i];
236: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
237: }
238:
239: /* No longer need the original indices*/
240: for (i=0; i<imax; ++i) {
241: ISRestoreIndices(is[i],idx+i);
242: }
243: PetscFree(idx);
245: for (i=0; i<imax; ++i) {
246: ISDestroy(is[i]);
247: }
248:
249: /* Do Local work*/
250: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
252: /* Receive messages*/
253: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
254: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
255:
256: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
257: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
259: /* Phase 1 sends are complete - deallocate buffers */
260: PetscFree(outdat);
261: PetscFree(w1);
263: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
264: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
265: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
266: PetscFree(rbuf);
268:
269: /* Send the data back*/
270: /* Do a global reduction to determine the buffer space required for incoming messages */
271: {
272: PetscMPIInt *rw1;
273:
274: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
275: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
277: for (i=0; i<nrqr; ++i) {
278: proc = recv_status[i].MPI_SOURCE;
279: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
280: rw1[proc] = isz1[i];
281: }
282: PetscFree(onodes1);
283: PetscFree(olengths1);
285: /* Determine the number of messages to expect, their lengths, and their from-ids */
286: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
287: PetscFree(rw1);
288: }
289: /* Now post the Irecvs corresponding to these messages */
290: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
292: /* Now post the sends */
293: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
294: for (i=0; i<nrqr; ++i) {
295: j = recv_status[i].MPI_SOURCE;
296: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
297: }
299: /* receive work done on other processors*/
300: {
301: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
302: PetscMPIInt idex;
303: PetscBT table_i;
304: MPI_Status *status2;
305:
306: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
307: for (i=0; i<nrqs; ++i) {
308: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
309: /* Process the message*/
310: rbuf2_i = rbuf2[idex];
311: ct1 = 2*rbuf2_i[0]+1;
312: jmax = rbuf2[idex][0];
313: for (j=1; j<=jmax; j++) {
314: max = rbuf2_i[2*j];
315: is_no = rbuf2_i[2*j-1];
316: isz_i = isz[is_no];
317: data_i = data[is_no];
318: table_i = table[is_no];
319: for (k=0; k<max; k++,ct1++) {
320: row = rbuf2_i[ct1];
321: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
322: }
323: isz[is_no] = isz_i;
324: }
325: }
327: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
328: PetscFree(status2);
329: }
330:
331: for (i=0; i<imax; ++i) {
332: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
333: }
334:
335: PetscFree(onodes2);
336: PetscFree(olengths2);
338: PetscFree(pa);
339: PetscFree(rbuf2);
340: PetscFree(s_waits1);
341: PetscFree(r_waits1);
342: PetscFree(s_waits2);
343: PetscFree(r_waits2);
344: PetscFree(table);
345: PetscFree(s_status);
346: PetscFree(recv_status);
347: PetscFree(xdata[0]);
348: PetscFree(xdata);
349: PetscFree(isz1);
350: return(0);
351: }
355: /*
356: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
357: the work on the local processor.
359: Inputs:
360: C - MATMPIAIJ matrix;
361: imax - total no of index sets processed at a time;
362: table - an array of PetscBT bit tables, each of size m bits.
363:
364: Output:
365: isz - array containing the count of the solution elements corresponding
366: to each index set;
367: data - pointer to the solutions
368: */
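/*
   The dedup idiom used below (and in MatIncreaseOverlap_MPIAIJ_Once above) is the
   PetscBT bit table: one bit per global row, so a row enters a given index set at most
   once. A minimal sketch with illustrative names:

     PetscBT seen;
     PetscBTCreate(m,seen);                  // m bits, all initially zero
     if (!PetscBTLookupSet(seen,row)) {      // returns the old bit, then sets it
       data[isz++] = row;                    // first occurrence: record the row
     }
     PetscBTDestroy(seen);
*/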
369: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
370: {
371: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
372: Mat A = c->A,B = c->B;
373: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
374: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
375: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
376: PetscBT table_i;
379: rstart = C->rmap->rstart;
380: cstart = C->cmap->rstart;
381: ai = a->i;
382: aj = a->j;
383: bi = b->i;
384: bj = b->j;
385: garray = c->garray;
387:
388: for (i=0; i<imax; i++) {
389: data_i = data[i];
390: table_i = table[i];
391: isz_i = isz[i];
392: for (j=0,max=isz[i]; j<max; j++) {
393: row = data_i[j] - rstart;
394: start = ai[row];
395: end = ai[row+1];
396: for (k=start; k<end; k++) { /* Amat */
397: val = aj[k] + cstart;
398: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399: }
400: start = bi[row];
401: end = bi[row+1];
402: for (k=start; k<end; k++) { /* Bmat */
403: val = garray[bj[k]];
404: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
405: }
406: }
407: isz[i] = isz_i;
408: }
409: return(0);
410: }
414: /*
415: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
416: and return the output
418: Input:
419: C - the matrix
420: nrqr - no of messages being processed.
421: rbuf - an array of pointers to the received requests
422:
423: Output:
424: xdata - array of messages to be sent back
425: isz1 - size of each message
427: For better efficiency perhaps we should malloc each xdata[i] separately;
428: then if a realloc is required we need only copy the data for that one row
429: rather than all previous rows, as is done now with a single large chunk of
430: memory.
432: */
433: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
434: {
435: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
436: Mat A = c->A,B = c->B;
437: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
439: PetscMPIInt rank;
440: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
441: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
442: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
443: PetscInt *rbuf_i,kmax,rbuf_0;
444: PetscBT xtable;
447: rank = c->rank;
448: m = C->rmap->N;
449: rstart = C->rmap->rstart;
450: cstart = C->cmap->rstart;
451: ai = a->i;
452: aj = a->j;
453: bi = b->i;
454: bj = b->j;
455: garray = c->garray;
456:
457:
458: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
459: rbuf_i = rbuf[i];
460: rbuf_0 = rbuf_i[0];
461: ct += rbuf_0;
462: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
463: }
464:
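/* Initial size guess for the single reply buffer xdata[0]: the larger of the total
   number of requested row indices (total_sz) and the number of requested IS blocks
   times the average number of nonzeros per local row (max1), tripled for slack.
   The buffer is regrown below whenever the guess proves too small. */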
465: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
466: else max1 = 1;
467: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
468: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
469: ++no_malloc;
470: PetscBTCreate(m,xtable);
471: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
472:
473: ct3 = 0;
474: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
475: rbuf_i = rbuf[i];
476: rbuf_0 = rbuf_i[0];
477: ct1 = 2*rbuf_0+1;
478: ct2 = ct1;
479: ct3 += ct1;
480: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
481: PetscBTMemzero(m,xtable);
482: oct2 = ct2;
483: kmax = rbuf_i[2*j];
484: for (k=0; k<kmax; k++,ct1++) {
485: row = rbuf_i[ct1];
486: if (!PetscBTLookupSet(xtable,row)) {
487: if (!(ct3 < mem_estimate)) {
488: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
489: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
490: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
491: PetscFree(xdata[0]);
492: xdata[0] = tmp;
493: mem_estimate = new_estimate; ++no_malloc;
494: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
495: }
496: xdata[i][ct2++] = row;
497: ct3++;
498: }
499: }
500: for (k=oct2,max2=ct2; k<max2; k++) {
501: row = xdata[i][k] - rstart;
502: start = ai[row];
503: end = ai[row+1];
504: for (l=start; l<end; l++) {
505: val = aj[l] + cstart;
506: if (!PetscBTLookupSet(xtable,val)) {
507: if (!(ct3 < mem_estimate)) {
508: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
509: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
510: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
511: PetscFree(xdata[0]);
512: xdata[0] = tmp;
513: mem_estimate = new_estimate; ++no_malloc;
514: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
515: }
516: xdata[i][ct2++] = val;
517: ct3++;
518: }
519: }
520: start = bi[row];
521: end = bi[row+1];
522: for (l=start; l<end; l++) {
523: val = garray[bj[l]];
524: if (!PetscBTLookupSet(xtable,val)) {
525: if (!(ct3 < mem_estimate)) {
526: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
527: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
528: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
529: PetscFree(xdata[0]);
530: xdata[0] = tmp;
531: mem_estimate = new_estimate; ++no_malloc;
532: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
533: }
534: xdata[i][ct2++] = val;
535: ct3++;
536: }
537: }
538: }
539: /* Update the header*/
540: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
541: xdata[i][2*j-1] = rbuf_i[2*j-1];
542: }
543: xdata[i][0] = rbuf_0;
544: xdata[i+1] = xdata[i] + ct2;
545: isz1[i] = ct2; /* size of each message */
546: }
547: PetscBTDestroy(xtable);
548: PetscInfo4(C,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);
549: return(0);
550: }
551: /* -------------------------------------------------------------------------*/
552: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
553: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
554: /*
555: Every processor gets the entire matrix
556: */
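/*
   A minimal sketch (illustrative names) of a call that reaches this routine through
   MatGetSubMatrices(): one index set per process covering every row and column of a
   square matrix triggers the fast all-gather path selected in MatGetSubMatrices_MPIAIJ()
   below (subject to the -use_fast_submatrix option checked there).

     IS  isrow,iscol;
     Mat *Aseq;
     ISCreateStride(PETSC_COMM_SELF,M,0,1,&isrow);    // rows 0,1,...,M-1
     ISCreateStride(PETSC_COMM_SELF,M,0,1,&iscol);    // all columns of the square matrix
     MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&Aseq);
     // Aseq[0] now holds a sequential copy of the entire parallel matrix on each process
*/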
559: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
560: {
561: Mat B;
562: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
563: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
565: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
566: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
567: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
568: MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
571: MPI_Comm_size(((PetscObject)A)->comm,&size);
572: MPI_Comm_rank(((PetscObject)A)->comm,&rank);
574: if (scall == MAT_INITIAL_MATRIX) {
575: /* ----------------------------------------------------------------
576: Tell every processor the number of nonzeros per row
577: */
578: PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);
579: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
580: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
581: }
582: sendcount = A->rmap->rend - A->rmap->rstart;
583: PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
584: displs = recvcounts + size;
585: for (i=0; i<size; i++) {
586: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
587: displs[i] = A->rmap->range[i];
588: }
589: #if defined(PETSC_HAVE_MPI_IN_PLACE)
590: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
591: #else
592: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
593: #endif
594: /* ---------------------------------------------------------------
595: Create the sequential matrix of the same type as the local block diagonal
596: */
597: MatCreate(PETSC_COMM_SELF,&B);
598: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
599: MatSetType(B,((PetscObject)a->A)->type_name);
600: MatSeqAIJSetPreallocation(B,0,lens);
601: PetscMalloc(sizeof(Mat),Bin);
602: **Bin = B;
603: b = (Mat_SeqAIJ *)B->data;
605: /*--------------------------------------------------------------------
606: Copy my part of matrix column indices over
607: */
608: sendcount = ad->nz + bd->nz;
609: jsendbuf = b->j + b->i[rstarts[rank]];
610: a_jsendbuf = ad->j;
611: b_jsendbuf = bd->j;
612: n = A->rmap->rend - A->rmap->rstart;
613: cnt = 0;
614: for (i=0; i<n; i++) {
616: /* put in lower diagonal portion */
617: m = bd->i[i+1] - bd->i[i];
618: while (m > 0) {
619: /* is it above diagonal (in bd (compressed) numbering) */
620: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
621: jsendbuf[cnt++] = garray[*b_jsendbuf++];
622: m--;
623: }
625: /* put in diagonal portion */
626: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
627: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
628: }
630: /* put in upper diagonal portion */
631: while (m-- > 0) {
632: jsendbuf[cnt++] = garray[*b_jsendbuf++];
633: }
634: }
635: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
637: /*--------------------------------------------------------------------
638: Gather all column indices to all processors
639: */
640: for (i=0; i<size; i++) {
641: recvcounts[i] = 0;
642: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
643: recvcounts[i] += lens[j];
644: }
645: }
646: displs[0] = 0;
647: for (i=1; i<size; i++) {
648: displs[i] = displs[i-1] + recvcounts[i-1];
649: }
650: #if defined(PETSC_HAVE_MPI_IN_PLACE)
651: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
652: #else
653: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);
654: #endif
655: /*--------------------------------------------------------------------
656: Assemble the matrix into usable form (note numerical values not yet set)
657: */
658: /* set the b->ilen (length of each row) values */
659: PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));
660: /* set the b->i indices */
661: b->i[0] = 0;
662: for (i=1; i<=A->rmap->N; i++) {
663: b->i[i] = b->i[i-1] + lens[i-1];
664: }
665: PetscFree(lens);
666: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
667: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
669: } else {
670: B = **Bin;
671: b = (Mat_SeqAIJ *)B->data;
672: }
674: /*--------------------------------------------------------------------
675: Copy my part of matrix numerical values into the values location
676: */
677: if (flag == MAT_GET_VALUES){
678: sendcount = ad->nz + bd->nz;
679: sendbuf = b->a + b->i[rstarts[rank]];
680: a_sendbuf = ad->a;
681: b_sendbuf = bd->a;
682: b_sendj = bd->j;
683: n = A->rmap->rend - A->rmap->rstart;
684: cnt = 0;
685: for (i=0; i<n; i++) {
687: /* put in lower diagonal portion */
688: m = bd->i[i+1] - bd->i[i];
689: while (m > 0) {
690: /* is it above diagonal (in bd (compressed) numbering) */
691: if (garray[*b_sendj] > A->rmap->rstart + i) break;
692: sendbuf[cnt++] = *b_sendbuf++;
693: m--;
694: b_sendj++;
695: }
697: /* put in diagonal portion */
698: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
699: sendbuf[cnt++] = *a_sendbuf++;
700: }
702: /* put in upper diagonal portion */
703: while (m-- > 0) {
704: sendbuf[cnt++] = *b_sendbuf++;
705: b_sendj++;
706: }
707: }
708: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
709:
710: /* -----------------------------------------------------------------
711: Gather all numerical values to all processors
712: */
713: if (!recvcounts) {
714: PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
715: displs = recvcounts + size;
716: }
717: for (i=0; i<size; i++) {
718: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
719: }
720: displs[0] = 0;
721: for (i=1; i<size; i++) {
722: displs[i] = displs[i-1] + recvcounts[i-1];
723: }
724: recvbuf = b->a;
725: #if defined(PETSC_HAVE_MPI_IN_PLACE)
726: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
727: #else
728: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);
729: #endif
730: } /* end of (flag == MAT_GET_VALUES) */
731: PetscFree(recvcounts);
733: if (A->symmetric){
734: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
735: } else if (A->hermitian) {
736: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
737: } else if (A->structurally_symmetric) {
738: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
739: }
740: return(0);
741: }
745: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
746: {
748: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
749: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
752: /*
753: Check for the special case where each processor gets the entire matrix
754: */
755: if (ismax == 1 && C->rmap->N == C->cmap->N) {
756: ISIdentity(*isrow,&rowflag);
757: ISIdentity(*iscol,&colflag);
758: ISGetLocalSize(*isrow,&nrow);
759: ISGetLocalSize(*iscol,&ncol);
760: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
761: wantallmatrix = PETSC_TRUE;
762: PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
763: }
764: }
765: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);
766: if (twantallmatrix) {
767: MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
768: return(0);
769: }
771: /* Allocate memory to hold all the submatrices */
772: if (scall != MAT_REUSE_MATRIX) {
773: PetscMalloc((ismax+1)*sizeof(Mat),submat);
774: }
775: /* Determine the number of stages through which submatrices are done */
776: nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
777: if (!nmax) nmax = 1;
778: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
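/* nmax roughly caps the O(N) per-submatrix workspace (such as the column maps
   allocated in MatGetSubMatrices_MPIAIJ_Local) at about 20 MB per stage; for example,
   with N = 1,000,000 columns and 4-byte PetscInt, nmax = 20000000/(1000000*4) = 5
   submatrices per stage. */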
780: /* Make sure every processor loops through the nstages */
781: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);
783: for (i=0,pos=0; i<nstages; i++) {
784: if (pos+nmax <= ismax) max_no = nmax;
785: else if (pos == ismax) max_no = 0;
786: else max_no = ismax-pos;
787: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
788: pos += max_no;
789: }
790: return(0);
791: }
792: /* -------------------------------------------------------------------------*/
795: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
796: {
797: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
798: Mat A = c->A;
799: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
800: const PetscInt **icol,**irow;
801: PetscInt *nrow,*ncol,start;
803: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
804: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
805: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
806: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
807: PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
808: const PetscInt *irow_i;
809: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
810: PetscInt *rmap_i;
811: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
812: MPI_Request *r_waits4,*s_waits3,*s_waits4;
813: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
814: MPI_Status *r_status3,*r_status4,*s_status4;
815: MPI_Comm comm;
816: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
817: PetscTruth sorted;
818: PetscMPIInt *onodes1,*olengths1;
819: PetscMPIInt idex,idex2,end;
822: comm = ((PetscObject)C)->comm;
823: tag0 = ((PetscObject)C)->tag;
824: size = c->size;
825: rank = c->rank;
826:
827: /* Get some new tags to keep the communication clean */
828: PetscObjectGetNewTag((PetscObject)C,&tag1);
829: PetscObjectGetNewTag((PetscObject)C,&tag2);
830: PetscObjectGetNewTag((PetscObject)C,&tag3);
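/* The four tags correspond to the four rounds of communication below: tag0 carries
   the row requests, tag1 the per-row reply sizes, tag2 the column indices, and
   tag3 the numerical values. */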
832: /* Check if the row/col indices are sorted */
833: for (i=0; i<ismax; i++) {
834: ISSorted(isrow[i],&sorted);
835: /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
836: ISSorted(iscol[i],&sorted);
837: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
838: }
840: PetscMalloc((2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt)),&irow);
841: icol = irow + ismax;
842: nrow = (PetscInt*)(icol + ismax);
843: ncol = nrow + ismax;
845: for (i=0; i<ismax; i++) {
846: ISGetIndices(isrow[i],&irow[i]);
847: ISGetIndices(iscol[i],&icol[i]);
848: ISGetLocalSize(isrow[i],&nrow[i]);
849: ISGetLocalSize(iscol[i],&ncol[i]);
850: }
852: /* evaluate communication - message to whom, length of message, and buffer space
853: required. Based on this, buffers are allocated, and data copied into them */
854: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1); /* mesg size */
855: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
856: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
857: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
858: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialize work vector*/
859: for (i=0; i<ismax; i++) {
860: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
861: jmax = nrow[i];
862: irow_i = irow[i];
863: for (j=0; j<jmax; j++) {
864: l = 0;
865: row = irow_i[j];
866: while (row >= C->rmap->range[l+1]) l++;
867: proc = l;
868: w4[proc]++;
869: }
870: for (j=0; j<size; j++) {
871: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
872: }
873: }
874:
875: nrqs = 0; /* no of outgoing messages */
876: msz = 0; /* total mesg length (for all procs) */
877: w1[rank] = 0; /* no mesg sent to self */
878: w3[rank] = 0;
879: for (i=0; i<size; i++) {
880: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
881: }
882: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
883: for (i=0,j=0; i<size; i++) {
884: if (w1[i]) { pa[j] = i; j++; }
885: }
887: /* Each message would have a header = 1 + 2*(no of IS) + data */
888: for (i=0; i<nrqs; i++) {
889: j = pa[i];
890: w1[j] += w2[j] + 2* w3[j];
891: msz += w1[j];
892: }
894: /* Determine the number of messages to expect, their lengths, and their from-ids */
895: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
896: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
898: /* Now post the Irecvs corresponding to these messages */
899: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
900:
901: PetscFree(onodes1);
902: PetscFree(olengths1);
903:
904: /* Allocate Memory for outgoing messages */
905: PetscMalloc(2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt),&sbuf1);
906: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
907: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
908: /* allocate memory for outgoing data + buf to receive the first reply */
909: tmp = (PetscInt*)(ptr + size);
910: ctr = tmp + 2*msz;
912: {
913: PetscInt *iptr = tmp,ict = 0;
914: for (i=0; i<nrqs; i++) {
915: j = pa[i];
916: iptr += ict;
917: sbuf1[j] = iptr;
918: ict = w1[j];
919: }
920: }
922: /* Form the outgoing messages */
923: /* Initialize the header space */
924: for (i=0; i<nrqs; i++) {
925: j = pa[i];
926: sbuf1[j][0] = 0;
927: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
928: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
929: }
930:
931: /* Parse the isrow and copy data into outbuf */
932: for (i=0; i<ismax; i++) {
933: PetscMemzero(ctr,size*sizeof(PetscInt));
934: irow_i = irow[i];
935: jmax = nrow[i];
936: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
937: l = 0;
938: row = irow_i[j];
939: while (row >= C->rmap->range[l+1]) l++;
940: proc = l;
941: if (proc != rank) { /* copy to the outgoing buf*/
942: ctr[proc]++;
943: *ptr[proc] = row;
944: ptr[proc]++;
945: }
946: }
947: /* Update the headers for the current IS */
948: for (j=0; j<size; j++) { /* Can Optimise this loop too */
949: if ((ctr_j = ctr[j])) {
950: sbuf1_j = sbuf1[j];
951: k = ++sbuf1_j[0];
952: sbuf1_j[2*k] = ctr_j;
953: sbuf1_j[2*k-1] = i;
954: }
955: }
956: }
958: /* Now post the sends */
959: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
960: for (i=0; i<nrqs; ++i) {
961: j = pa[i];
962: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
963: }
965: /* Post Receives to capture the buffer size */
966: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
967: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
968: rbuf2[0] = tmp + msz;
969: for (i=1; i<nrqs; ++i) {
970: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
971: }
972: for (i=0; i<nrqs; ++i) {
973: j = pa[i];
974: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
975: }
977: /* Send to other procs the buf size they should allocate */
978:
980: /* Receive messages*/
981: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
982: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
983: PetscMalloc(2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*),&sbuf2);
984: req_size = (PetscInt*)(sbuf2 + nrqr);
985: req_source = req_size + nrqr;
986:
987: {
988: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
989: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
990: PetscInt *sbuf2_i;
992: for (i=0; i<nrqr; ++i) {
993: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
994: req_size[idex] = 0;
995: rbuf1_i = rbuf1[idex];
996: start = 2*rbuf1_i[0] + 1;
997: MPI_Get_count(r_status1+i,MPIU_INT,&end);
998: PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
999: sbuf2_i = sbuf2[idex];
1000: for (j=start; j<end; j++) {
1001: id = rbuf1_i[j] - rstart;
1002: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1003: sbuf2_i[j] = ncols;
1004: req_size[idex] += ncols;
1005: }
1006: req_source[idex] = r_status1[i].MPI_SOURCE;
1007: /* form the header */
1008: sbuf2_i[0] = req_size[idex];
1009: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
1010: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1011: }
1012: }
1013: PetscFree(r_status1);
1014: PetscFree(r_waits1);
1016: /* recv buffer sizes */
1017: /* Receive messages*/
1018:
1019: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1020: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1021: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1022: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1023: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1025: for (i=0; i<nrqs; ++i) {
1026: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1027: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1028: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1029: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1030: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1031: }
1032: PetscFree(r_status2);
1033: PetscFree(r_waits2);
1034:
1035: /* Wait on sends1 and sends2 */
1036: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1037: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1039: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1040: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1041: PetscFree(s_status1);
1042: PetscFree(s_status2);
1043: PetscFree(s_waits1);
1044: PetscFree(s_waits2);
1046: /* Now allocate buffers for a->j, and send them off */
1047: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1048: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1049: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1050: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1051:
1052: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1053: {
1054: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1055: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1056: PetscInt cend = C->cmap->rend;
1057: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1059: for (i=0; i<nrqr; i++) {
1060: rbuf1_i = rbuf1[i];
1061: sbuf_aj_i = sbuf_aj[i];
1062: ct1 = 2*rbuf1_i[0] + 1;
1063: ct2 = 0;
1064: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1065: kmax = rbuf1[i][2*j];
1066: for (k=0; k<kmax; k++,ct1++) {
1067: row = rbuf1_i[ct1] - rstart;
1068: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1069: ncols = nzA + nzB;
1070: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1072: /* load the column indices for this row into cols*/
1073: cols = sbuf_aj_i + ct2;
1074:
1075: lwrite = 0;
1076: for (l=0; l<nzB; l++) {
1077: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1078: }
1079: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1080: for (l=0; l<nzB; l++) {
1081: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1082: }
1084: ct2 += ncols;
1085: }
1086: }
1087: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1088: }
1089: }
1090: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1091: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1093: /* Allocate buffers for a->a, and send them off */
1094: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1095: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1096: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1097: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1098:
1099: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1100: {
1101: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1102: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1103: PetscInt cend = C->cmap->rend;
1104: PetscInt *b_j = b->j;
1105: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1106:
1107: for (i=0; i<nrqr; i++) {
1108: rbuf1_i = rbuf1[i];
1109: sbuf_aa_i = sbuf_aa[i];
1110: ct1 = 2*rbuf1_i[0]+1;
1111: ct2 = 0;
1112: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1113: kmax = rbuf1_i[2*j];
1114: for (k=0; k<kmax; k++,ct1++) {
1115: row = rbuf1_i[ct1] - rstart;
1116: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1117: ncols = nzA + nzB;
1118: cworkB = b_j + b_i[row];
1119: vworkA = a_a + a_i[row];
1120: vworkB = b_a + b_i[row];
1122: /* load the column values for this row into vals*/
1123: vals = sbuf_aa_i+ct2;
1124:
1125: lwrite = 0;
1126: for (l=0; l<nzB; l++) {
1127: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1128: }
1129: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1130: for (l=0; l<nzB; l++) {
1131: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1132: }
1133:
1134: ct2 += ncols;
1135: }
1136: }
1137: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1138: }
1139: }
1140: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1141: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1142: PetscFree(rbuf1);
1144: /* Form the matrix */
1145: /* create col map */
1146: {
1147: const PetscInt *icol_i;
1148:
1149: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ (1+ismax*C->cmap->N)*sizeof(PetscInt),&cmap);
1150: cmap[0] = (PetscInt*)(cmap + ismax);
1151: PetscMemzero(cmap[0],(1+ismax*C->cmap->N)*sizeof(PetscInt));
1152: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
1153: for (i=0; i<ismax; i++) {
1154: jmax = ncol[i];
1155: icol_i = icol[i];
1156: cmap_i = cmap[i];
1157: for (j=0; j<jmax; j++) {
1158: cmap_i[icol_i[j]] = j+1;
1159: }
1160: }
1161: }
1163: /* Create lens which is required for MatCreate... */
1164: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1165: PetscMalloc( (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);
1166: lens[0] = (PetscInt*)(lens + ismax);
1167: PetscMemzero(lens[0],j*sizeof(PetscInt));
1168: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1169:
1170: /* Update lens from local data */
1171: for (i=0; i<ismax; i++) {
1172: jmax = nrow[i];
1173: cmap_i = cmap[i];
1174: irow_i = irow[i];
1175: lens_i = lens[i];
1176: for (j=0; j<jmax; j++) {
1177: l = 0;
1178: row = irow_i[j];
1179: while (row >= C->rmap->range[l+1]) l++;
1180: proc = l;
1181: if (proc == rank) {
1182: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1183: for (k=0; k<ncols; k++) {
1184: if (cmap_i[cols[k]]) { lens_i[j]++;}
1185: }
1186: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1187: }
1188: }
1189: }
1190:
1191: /* Create row map*/
1192: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*C->rmap->N*sizeof(PetscInt),&rmap);
1193: rmap[0] = (PetscInt*)(rmap + ismax);
1194: PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));
1195: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
1196: for (i=0; i<ismax; i++) {
1197: rmap_i = rmap[i];
1198: irow_i = irow[i];
1199: jmax = nrow[i];
1200: for (j=0; j<jmax; j++) {
1201: rmap_i[irow_i[j]] = j;
1202: }
1203: }
1204:
1205: /* Update lens from offproc data */
1206: {
1207: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1209: for (tmp2=0; tmp2<nrqs; tmp2++) {
1210: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1211: idex = pa[idex2];
1212: sbuf1_i = sbuf1[idex];
1213: jmax = sbuf1_i[0];
1214: ct1 = 2*jmax+1;
1215: ct2 = 0;
1216: rbuf2_i = rbuf2[idex2];
1217: rbuf3_i = rbuf3[idex2];
1218: for (j=1; j<=jmax; j++) {
1219: is_no = sbuf1_i[2*j-1];
1220: max1 = sbuf1_i[2*j];
1221: lens_i = lens[is_no];
1222: cmap_i = cmap[is_no];
1223: rmap_i = rmap[is_no];
1224: for (k=0; k<max1; k++,ct1++) {
1225: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
1226: max2 = rbuf2_i[ct1];
1227: for (l=0; l<max2; l++,ct2++) {
1228: if (cmap_i[rbuf3_i[ct2]]) {
1229: lens_i[row]++;
1230: }
1231: }
1232: }
1233: }
1234: }
1235: }
1236: PetscFree(r_status3);
1237: PetscFree(r_waits3);
1238: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1239: PetscFree(s_status3);
1240: PetscFree(s_waits3);
1242: /* Create the submatrices */
1243: if (scall == MAT_REUSE_MATRIX) {
1244: PetscTruth flag;
1246: /*
1247: Assumes the new rows are the same length as the old rows, hence a potential bug!
1248: */
1249: for (i=0; i<ismax; i++) {
1250: mat = (Mat_SeqAIJ *)(submats[i]->data);
1251: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) {
1252: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1253: }
1254: PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);
1255: if (!flag) {
1256: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1257: }
1258: /* Initialize the matrix as if empty */
1259: PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));
1260: submats[i]->factor = C->factor;
1261: }
1262: } else {
1263: for (i=0; i<ismax; i++) {
1264: MatCreate(PETSC_COMM_SELF,submats+i);
1265: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1266: MatSetType(submats[i],((PetscObject)A)->type_name);
1267: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1268: }
1269: }
1271: /* Assemble the matrices */
1272: /* First assemble the local rows */
1273: {
1274: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1275: PetscScalar *imat_a;
1276:
1277: for (i=0; i<ismax; i++) {
1278: mat = (Mat_SeqAIJ*)submats[i]->data;
1279: imat_ilen = mat->ilen;
1280: imat_j = mat->j;
1281: imat_i = mat->i;
1282: imat_a = mat->a;
1283: cmap_i = cmap[i];
1284: rmap_i = rmap[i];
1285: irow_i = irow[i];
1286: jmax = nrow[i];
1287: for (j=0; j<jmax; j++) {
1288: l = 0;
1289: row = irow_i[j];
1290: while (row >= C->rmap->range[l+1]) l++;
1291: proc = l;
1292: if (proc == rank) {
1293: old_row = row;
1294: row = rmap_i[row];
1295: ilen_row = imat_ilen[row];
1296: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1297: mat_i = imat_i[row];
1298: mat_a = imat_a + mat_i;
1299: mat_j = imat_j + mat_i;
1300: for (k=0; k<ncols; k++) {
1301: if ((tcol = cmap_i[cols[k]])) {
1302: *mat_j++ = tcol - 1;
1303: *mat_a++ = vals[k];
1304: ilen_row++;
1305: }
1306: }
1307: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1308: imat_ilen[row] = ilen_row;
1309: }
1310: }
1311: }
1312: }
1314: /* Now assemble the off proc rows*/
1315: {
1316: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1317: PetscInt *imat_j,*imat_i;
1318: PetscScalar *imat_a,*rbuf4_i;
1320: for (tmp2=0; tmp2<nrqs; tmp2++) {
1321: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1322: idex = pa[idex2];
1323: sbuf1_i = sbuf1[idex];
1324: jmax = sbuf1_i[0];
1325: ct1 = 2*jmax + 1;
1326: ct2 = 0;
1327: rbuf2_i = rbuf2[idex2];
1328: rbuf3_i = rbuf3[idex2];
1329: rbuf4_i = rbuf4[idex2];
1330: for (j=1; j<=jmax; j++) {
1331: is_no = sbuf1_i[2*j-1];
1332: rmap_i = rmap[is_no];
1333: cmap_i = cmap[is_no];
1334: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1335: imat_ilen = mat->ilen;
1336: imat_j = mat->j;
1337: imat_i = mat->i;
1338: imat_a = mat->a;
1339: max1 = sbuf1_i[2*j];
1340: for (k=0; k<max1; k++,ct1++) {
1341: row = sbuf1_i[ct1];
1342: row = rmap_i[row];
1343: ilen = imat_ilen[row];
1344: mat_i = imat_i[row];
1345: mat_a = imat_a + mat_i;
1346: mat_j = imat_j + mat_i;
1347: max2 = rbuf2_i[ct1];
1348: for (l=0; l<max2; l++,ct2++) {
1349: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1350: *mat_j++ = tcol - 1;
1351: *mat_a++ = rbuf4_i[ct2];
1352: ilen++;
1353: }
1354: }
1355: imat_ilen[row] = ilen;
1356: }
1357: }
1358: }
1359: }
1360: PetscFree(r_status4);
1361: PetscFree(r_waits4);
1362: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1363: PetscFree(s_waits4);
1364: PetscFree(s_status4);
1366: /* Restore the indices */
1367: for (i=0; i<ismax; i++) {
1368: ISRestoreIndices(isrow[i],irow+i);
1369: ISRestoreIndices(iscol[i],icol+i);
1370: }
1372: /* Destroy allocated memory */
1373: PetscFree(irow);
1374: PetscFree(w1);
1375: PetscFree(pa);
1377: PetscFree(sbuf1);
1378: PetscFree(rbuf2);
1379: for (i=0; i<nrqr; ++i) {
1380: PetscFree(sbuf2[i]);
1381: }
1382: for (i=0; i<nrqs; ++i) {
1383: PetscFree(rbuf3[i]);
1384: PetscFree(rbuf4[i]);
1385: }
1387: PetscFree(sbuf2);
1388: PetscFree(rbuf3);
1389: PetscFree(rbuf4);
1390: PetscFree(sbuf_aj[0]);
1391: PetscFree(sbuf_aj);
1392: PetscFree(sbuf_aa[0]);
1393: PetscFree(sbuf_aa);
1394:
1395: PetscFree(cmap);
1396: PetscFree(rmap);
1397: PetscFree(lens);
1399: for (i=0; i<ismax; i++) {
1400: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1401: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1402: }
1403: return(0);
1404: }