Actual source code: baijov.c
1: #define PETSCMAT_DLL
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5:   and to find submatrices that are shared across processors.
6: */
7: #include "../src/mat/impls/baij/mpi/mpibaij.h"
8: #include "petscbt.h"
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
12: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
13: EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
18: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19: {
21: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
22: IS *is_new;
25: PetscMalloc(imax*sizeof(IS),&is_new);
26: /* Convert the indices into block format */
27: ISCompressIndicesGeneral(N,bs,imax,is,is_new);
28: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
31: }
32: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
33: ISExpandIndicesGeneral(N,bs,imax,is_new,is);
34: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
35: PetscFree(is_new);
36: return(0);
37: }
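/*
    A minimal usage sketch (illustrative only, not part of this file): the routine
    above is normally reached through the public MatIncreaseOverlap() interface,
    for example when building overlapping subdomains for a Schwarz-type
    preconditioner.  Here A is assumed to be an existing MATMPIBAIJ matrix and
    is an existing index set of local block rows; 2 is the requested overlap.

        MatIncreaseOverlap(A,1,&is,2);
        ...
        ISDestroy(is);
*/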
39: /*
40: Sample message format:
41: If a processor A wants processor B to process some elements corresponding
42: to index sets is[1], is[5]
43: mesg [0] = 2 (no of index sets in the mesg)
44: -----------
45:      mesg [1] = 1   =>  is[1]
46:      mesg [2] = sizeof(is[1]);
47:    -----------
48:      mesg [3] = 5   =>  is[5]
49:      mesg [4] = sizeof(is[5]);
50:    -----------
51:      mesg [5]
52:      mesg [n]  data(is[1])
53:    -----------
54:      mesg[n+1]
55:      mesg[m]  data(is[5])
56:    -----------
57:
58: Notes:
59: nrqs - no of requests sent (or to be sent out)
60:   nrqr - no of requests received (which have to be or which have been processed)
61: */
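/*
    Illustrative sketch (hypothetical helper, not part of PETSc): how a receiver
    could walk one incoming message laid out in the format described above.
*/
#if 0
static void WalkOverlapMessage(const PetscInt *mesg)
{
  PetscInt nis = mesg[0];              /* number of index sets in the message */
  PetscInt i,off = 1 + 2*nis;          /* row data starts right after the header */
  for (i=1; i<=nis; i++) {
    PetscInt is_id  = mesg[2*i-1];     /* which IS the following rows belong to */
    PetscInt is_len = mesg[2*i];       /* number of block-row indices for that IS */
    /* the rows requested for is[is_id] are mesg[off] .. mesg[off+is_len-1] */
    off += is_len;
  }
}
#endif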
64: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
65: {
66: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
67: const PetscInt **idx,*idx_i;
68: PetscInt *n,*w3,*w4,*rtable,**data,len;
70: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
71: PetscInt Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
72: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
73: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
74: PetscBT *table;
75: MPI_Comm comm;
76: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
77: MPI_Status *s_status,*recv_status;
80: comm = ((PetscObject)C)->comm;
81: size = c->size;
82: rank = c->rank;
83: Mbs = c->Mbs;
85: PetscObjectGetNewTag((PetscObject)C,&tag1);
86: PetscObjectGetNewTag((PetscObject)C,&tag2);
88: PetscMalloc((imax+1)*sizeof(PetscInt*)+ (imax + Mbs)*sizeof(PetscInt),&idx);
89: n = (PetscInt*)(idx + imax);
90: rtable = n + imax;
91:
92: for (i=0; i<imax; i++) {
93: ISGetIndices(is[i],&idx[i]);
94: ISGetLocalSize(is[i],&n[i]);
95: }
96:
97: /* Create hash table for the mapping :row -> proc*/
98: for (i=0,j=0; i<size; i++) {
99: len = c->rangebs[i+1];
100: for (; j<len; j++) {
101: rtable[j] = i;
102: }
103: }
105:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
106:      required. Based on this, buffers are allocated, and data copied into them*/
107: PetscMalloc(size*2*sizeof(PetscInt)+size*2*sizeof(PetscMPIInt),&w1);/* mesg size */
108: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
109: w3 = (PetscInt*) (w2 + size); /* no of IS that needs to be sent to proc i */
110: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
111: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
112: for (i=0; i<imax; i++) {
113: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
114: idx_i = idx[i];
115: len = n[i];
116: for (j=0; j<len; j++) {
117: row = idx_i[j];
118: if (row < 0) {
119: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
120: }
121: proc = rtable[row];
122: w4[proc]++;
123: }
124: for (j=0; j<size; j++){
125: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
126: }
127: }
129: nrqs = 0; /* no of outgoing messages */
130:   msz  = 0;              /* total mesg length (for all procs) */
131: w1[rank] = 0; /* no mesg sent to itself */
132: w3[rank] = 0;
133: for (i=0; i<size; i++) {
134: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
135: }
136: /* pa - is list of processors to communicate with */
137: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
138: for (i=0,j=0; i<size; i++) {
139: if (w1[i]) {pa[j] = i; j++;}
140: }
142: /* Each message would have a header = 1 + 2*(no of IS) + data */
143: for (i=0; i<nrqs; i++) {
144: j = pa[i];
145: w1[j] += w2[j] + 2*w3[j];
146: msz += w1[j];
147: }
148:
149:   /* Determine the number of messages to expect, their lengths, and their from-ids */
150: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
151: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
153: /* Now post the Irecvs corresponding to these messages */
154: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
155:
156: /* Allocate Memory for outgoing messages */
157: PetscMalloc(2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt),&outdat);
158: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
159: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
160: tmp = (PetscInt*)(outdat + 2*size);
161: ctr = tmp + msz;
163: {
164: PetscInt *iptr = tmp,ict = 0;
165: for (i=0; i<nrqs; i++) {
166: j = pa[i];
167: iptr += ict;
168: outdat[j] = iptr;
169: ict = w1[j];
170: }
171: }
173: /* Form the outgoing messages */
174: /*plug in the headers*/
175: for (i=0; i<nrqs; i++) {
176: j = pa[i];
177: outdat[j][0] = 0;
178: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
179: ptr[j] = outdat[j] + 2*w3[j] + 1;
180: }
181:
182: /* Memory for doing local proc's work*/
183: {
184: PetscInt *d_p;
185: char *t_p;
187: PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
188: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char),&table);
189: PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
190: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));
191: data = (PetscInt **)(table + imax);
192: isz = (PetscInt *)(data + imax);
193: d_p = (PetscInt *)(isz + imax);
194: t_p = (char *)(d_p + Mbs*imax);
195: for (i=0; i<imax; i++) {
196: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
197: data[i] = d_p + (Mbs)*i;
198: }
199: }
201: /* Parse the IS and update local tables and the outgoing buf with the data*/
202: {
203: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
204: PetscBT table_i;
206: for (i=0; i<imax; i++) {
207: PetscMemzero(ctr,size*sizeof(PetscInt));
208: n_i = n[i];
209: table_i = table[i];
210: idx_i = idx[i];
211: data_i = data[i];
212: isz_i = isz[i];
213: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
214: row = idx_i[j];
215: proc = rtable[row];
216: if (proc != rank) { /* copy to the outgoing buffer */
217: ctr[proc]++;
218: *ptr[proc] = row;
219: ptr[proc]++;
220: }
221: else { /* Update the local table */
222: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
223: }
224: }
225: /* Update the headers for the current IS */
226: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
227: if ((ctr_j = ctr[j])) {
228: outdat_j = outdat[j];
229: k = ++outdat_j[0];
230: outdat_j[2*k] = ctr_j;
231: outdat_j[2*k-1] = i;
232: }
233: }
234: isz[i] = isz_i;
235: }
236: }
237:
238: /* Now post the sends */
239: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
240: for (i=0; i<nrqs; ++i) {
241: j = pa[i];
242: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
243: }
244:
245: /* No longer need the original indices*/
246: for (i=0; i<imax; ++i) {
247: ISRestoreIndices(is[i],idx+i);
248: }
249: PetscFree(idx);
251: for (i=0; i<imax; ++i) {
252: ISDestroy(is[i]);
253: }
254:
255: /* Do Local work*/
256: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
258: /* Receive messages*/
259: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
260: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
261:
262: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
263: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
265: /* Phase 1 sends are complete - deallocate buffers */
266: PetscFree(outdat);
267: PetscFree(w1);
269: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
270: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
271: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
272: PetscFree(rbuf);
274: /* Send the data back*/
275:   /* Do a global reduction to know the buffer space required for incoming messages*/
276: {
277: PetscMPIInt *rw1;
278:
279: PetscMalloc(size*sizeof(PetscInt),&rw1);
280: PetscMemzero(rw1,size*sizeof(PetscInt));
282: for (i=0; i<nrqr; ++i) {
283: proc = recv_status[i].MPI_SOURCE;
284: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
285: rw1[proc] = isz1[i];
286: }
287:
288: PetscFree(onodes1);
289: PetscFree(olengths1);
291:     /* Determine the number of messages to expect, their lengths, and their from-ids */
292: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
293: PetscFree(rw1);
294: }
295: /* Now post the Irecvs corresponding to these messages */
296: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
297:
298: /* Now post the sends */
299: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
300: for (i=0; i<nrqr; ++i) {
301: j = recv_status[i].MPI_SOURCE;
302: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
303: }
305: /* receive work done on other processors*/
306: {
307: PetscMPIInt idex;
308: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
309: PetscBT table_i;
310: MPI_Status *status2;
311:
312: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
314: for (i=0; i<nrqs; ++i) {
315: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
316: /* Process the message*/
317: rbuf2_i = rbuf2[idex];
318: ct1 = 2*rbuf2_i[0]+1;
319: jmax = rbuf2[idex][0];
320: for (j=1; j<=jmax; j++) {
321: max = rbuf2_i[2*j];
322: is_no = rbuf2_i[2*j-1];
323: isz_i = isz[is_no];
324: data_i = data[is_no];
325: table_i = table[is_no];
326: for (k=0; k<max; k++,ct1++) {
327: row = rbuf2_i[ct1];
328: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
329: }
330: isz[is_no] = isz_i;
331: }
332: }
333: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
334: PetscFree(status2);
335: }
336:
337: for (i=0; i<imax; ++i) {
338: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
339: }
340:
341:
342: PetscFree(onodes2);
343: PetscFree(olengths2);
345: PetscFree(pa);
346: PetscFree(rbuf2);
347: PetscFree(s_waits1);
348: PetscFree(r_waits1);
349: PetscFree(s_waits2);
350: PetscFree(r_waits2);
351: PetscFree(table);
352: PetscFree(s_status);
353: PetscFree(recv_status);
354: PetscFree(xdata[0]);
355: PetscFree(xdata);
356: PetscFree(isz1);
357: return(0);
358: }
362: /*
363:    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
364: the work on the local processor.
366: Inputs:
367: C - MAT_MPIBAIJ;
368: imax - total no of index sets processed at a time;
369: table - an array of char - size = Mbs bits.
370:
371: Output:
372: isz - array containing the count of the solution elements corresponding
373: to each index set;
374: data - pointer to the solutions
375: */
376: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
377: {
378: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
379: Mat A = c->A,B = c->B;
380: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
381: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
382: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
383: PetscBT table_i;
386: rstart = c->rstartbs;
387: cstart = c->cstartbs;
388: ai = a->i;
389: aj = a->j;
390: bi = b->i;
391: bj = b->j;
392: garray = c->garray;
394:
395: for (i=0; i<imax; i++) {
396: data_i = data[i];
397: table_i = table[i];
398: isz_i = isz[i];
399: for (j=0,max=isz[i]; j<max; j++) {
400: row = data_i[j] - rstart;
401: start = ai[row];
402: end = ai[row+1];
403: for (k=start; k<end; k++) { /* Amat */
404: val = aj[k] + cstart;
405: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
406: }
407: start = bi[row];
408: end = bi[row+1];
409: for (k=start; k<end; k++) { /* Bmat */
410: val = garray[bj[k]];
411: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
412: }
413: }
414: isz[i] = isz_i;
415: }
416: return(0);
417: }
420: /*
421:       MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
422: and return the output
424: Input:
425: C - the matrix
426: nrqr - no of messages being processed.
427:        rbuf - an array of pointers to the received requests
428:
429: Output:
430: xdata - array of messages to be sent back
431: isz1 - size of each message
433:   For better efficiency perhaps we should malloc separately each xdata[i],
434:   then if a re-allocation is required we need only copy the data for that one row
435:   rather than all previous rows, as is done now where a single large chunk of
436:   memory is used.
438: */
439: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
440: {
441: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
442: Mat A = c->A,B = c->B;
443: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
445: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
446: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
447: PetscInt val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
448: PetscInt *rbuf_i,kmax,rbuf_0;
449: PetscBT xtable;
452: rank = c->rank;
453: Mbs = c->Mbs;
454: rstart = c->rstartbs;
455: cstart = c->cstartbs;
456: ai = a->i;
457: aj = a->j;
458: bi = b->i;
459: bj = b->j;
460: garray = c->garray;
461:
462:
463: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
464: rbuf_i = rbuf[i];
465: rbuf_0 = rbuf_i[0];
466: ct += rbuf_0;
467: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
468: }
469:
470: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
471: else max1 = 1;
472: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
473: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
474: ++no_malloc;
475: PetscBTCreate(Mbs,xtable);
476: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
477:
478: ct3 = 0;
479:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
480: rbuf_i = rbuf[i];
481: rbuf_0 = rbuf_i[0];
482: ct1 = 2*rbuf_0+1;
483: ct2 = ct1;
484: ct3 += ct1;
485: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
486: PetscBTMemzero(Mbs,xtable);
487: oct2 = ct2;
488: kmax = rbuf_i[2*j];
489: for (k=0; k<kmax; k++,ct1++) {
490: row = rbuf_i[ct1];
491: if (!PetscBTLookupSet(xtable,row)) {
492: if (!(ct3 < mem_estimate)) {
493: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
494: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
495: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
496: PetscFree(xdata[0]);
497: xdata[0] = tmp;
498: mem_estimate = new_estimate; ++no_malloc;
499: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
500: }
501: xdata[i][ct2++] = row;
502: ct3++;
503: }
504: }
505: for (k=oct2,max2=ct2; k<max2; k++) {
506: row = xdata[i][k] - rstart;
507: start = ai[row];
508: end = ai[row+1];
509: for (l=start; l<end; l++) {
510: val = aj[l] + cstart;
511: if (!PetscBTLookupSet(xtable,val)) {
512: if (!(ct3 < mem_estimate)) {
513: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
514: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
515: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
516: PetscFree(xdata[0]);
517: xdata[0] = tmp;
518: mem_estimate = new_estimate; ++no_malloc;
519: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
520: }
521: xdata[i][ct2++] = val;
522: ct3++;
523: }
524: }
525: start = bi[row];
526: end = bi[row+1];
527: for (l=start; l<end; l++) {
528: val = garray[bj[l]];
529: if (!PetscBTLookupSet(xtable,val)) {
530: if (!(ct3 < mem_estimate)) {
531: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
532: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
533: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
534: PetscFree(xdata[0]);
535: xdata[0] = tmp;
536: mem_estimate = new_estimate; ++no_malloc;
537: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
538: }
539: xdata[i][ct2++] = val;
540: ct3++;
541: }
542: }
543: }
544: /* Update the header*/
545: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
546: xdata[i][2*j-1] = rbuf_i[2*j-1];
547: }
548: xdata[i][0] = rbuf_0;
549: xdata[i+1] = xdata[i] + ct2;
550: isz1[i] = ct2; /* size of each message */
551: }
552: PetscBTDestroy(xtable);
553: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
554: return(0);
555: }
557: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
561: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
562: {
563: IS *isrow_new,*iscol_new;
564: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
566: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
569:   /* The compression and expansion should be avoided. Doesn't point
570:      out errors; it might change the indices and hence is buggy */
572: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
573: iscol_new = isrow_new + ismax;
574: ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);
575: ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);
577: /* Allocate memory to hold all the submatrices */
578: if (scall != MAT_REUSE_MATRIX) {
579: PetscMalloc((ismax+1)*sizeof(Mat),submat);
580: }
581: /* Determine the number of stages through which submatrices are done */
582: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
583: if (!nmax) nmax = 1;
584: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
585:
586: /* Make sure every processor loops through the nstages */
587: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);
588: for (i=0,pos=0; i<nstages; i++) {
589: if (pos+nmax <= ismax) max_no = nmax;
590: else if (pos == ismax) max_no = 0;
591: else max_no = ismax-pos;
592: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
593: pos += max_no;
594: }
595:
596: for (i=0; i<ismax; i++) {
597: ISDestroy(isrow_new[i]);
598: ISDestroy(iscol_new[i]);
599: }
600: PetscFree(isrow_new);
601: return(0);
602: }
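/*
    A minimal usage sketch (illustrative only): the routine above is normally
    reached through the public MatGetSubMatrices() interface.  Here A is assumed
    to be an existing MATMPIBAIJ matrix and isrow/iscol existing sorted,
    block-aligned index sets; submat comes back as an array of sequential
    matrices, one per requested pair of index sets.

        Mat *submat;
        MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
        ...
        MatDestroyMatrices(1,&submat);
*/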
604: #if defined (PETSC_USE_CTABLE)
607: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
608: {
609: PetscInt nGlobalNd = proc_gnode[size];
610: PetscMPIInt fproc = PetscMPIIntCast( (PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)));
611:
613: if (fproc > size) fproc = size;
614: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
615: if (row < proc_gnode[fproc]) fproc--;
616: else fproc++;
617: }
618: *rank = fproc;
619: return(0);
620: }
621: #endif
623: /* -------------------------------------------------------------------------*/
624: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
627: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
628: {
629: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
630: Mat A = c->A;
631: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
632: const PetscInt **irow,**icol,*irow_i;
633: PetscInt *nrow,*ncol,*w3,*w4,start;
635: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
636: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
637: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
638: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
639: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
640: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
641: PetscInt bs=C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
642: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
643: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
644: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
645: MPI_Request *r_waits4,*s_waits3,*s_waits4;
646: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
647: MPI_Status *r_status3,*r_status4,*s_status4;
648: MPI_Comm comm;
649: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
650: MatScalar *a_a=a->a,*b_a=b->a;
651: PetscTruth flag,sorted;
652: PetscMPIInt *onodes1,*olengths1;
654: #if defined (PETSC_USE_CTABLE)
655: PetscInt tt;
656: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
657: #else
658: PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
659: #endif
662: comm = ((PetscObject)C)->comm;
663: tag0 = ((PetscObject)C)->tag;
664: size = c->size;
665: rank = c->rank;
666:
667: /* Get some new tags to keep the communication clean */
668: PetscObjectGetNewTag((PetscObject)C,&tag1);
669: PetscObjectGetNewTag((PetscObject)C,&tag2);
670: PetscObjectGetNewTag((PetscObject)C,&tag3);
672: /* Check if the col indices are sorted */
673: for (i=0; i<ismax; i++) {
674: ISSorted(iscol[i],&sorted);
675: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
676: }
678: #if defined(PETSC_USE_CTABLE)
679: PetscMalloc((2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt)),&irow);
680: #else
681: PetscMalloc((2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt)) + (Mbs+1)*sizeof(PetscInt),&irow);
682: #endif
684: icol = irow + ismax;
685: nrow = (PetscInt*)(icol + ismax);
686: ncol = nrow + ismax;
687: #if !defined (PETSC_USE_CTABLE)
688: rtable = ncol + ismax;
689: /* Create hash table for the mapping :row -> proc*/
690: for (i=0,j=0; i<size; i++) {
691: jmax = c->rowners[i+1];
692: for (; j<jmax; j++) {
693: rtable[j] = i;
694: }
695: }
696: #endif
697:
698: for (i=0; i<ismax; i++) {
699: ISGetIndices(isrow[i],&irow[i]);
700: ISGetIndices(iscol[i],&icol[i]);
701: ISGetLocalSize(isrow[i],&nrow[i]);
702: ISGetLocalSize(iscol[i],&ncol[i]);
703: }
705:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
706:      required. Based on this, buffers are allocated, and data copied into them*/
707: PetscMalloc(size*2*sizeof(PetscInt) + size*2*sizeof(PetscMPIInt),&w1); /* mesg size */
708: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
709: w3 = (PetscInt*)(w2 + size); /* no of IS that needs to be sent to proc i */
710: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
711: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
712: for (i=0; i<ismax; i++) {
713: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
714: jmax = nrow[i];
715: irow_i = irow[i];
716: for (j=0; j<jmax; j++) {
717: row = irow_i[j];
718: #if defined (PETSC_USE_CTABLE)
719: PetscGetProc(row,size,c->rangebs,&proc);
720: #else
721: proc = rtable[row];
722: #endif
723: w4[proc]++;
724: }
725: for (j=0; j<size; j++) {
726: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
727: }
728: }
730: nrqs = 0; /* no of outgoing messages */
731: msz = 0; /* total mesg length for all proc */
732:   w1[rank] = 0; /* no mesg sent to itself */
733: w3[rank] = 0;
734: for (i=0; i<size; i++) {
735: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
736: }
737: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
738: for (i=0,j=0; i<size; i++) {
739: if (w1[i]) { pa[j] = i; j++; }
740: }
742: /* Each message would have a header = 1 + 2*(no of IS) + data */
743: for (i=0; i<nrqs; i++) {
744: j = pa[i];
745: w1[j] += w2[j] + 2* w3[j];
746: msz += w1[j];
747: }
749:   /* Determine the number of messages to expect, their lengths, and their from-ids */
750: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
751: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
753: /* Now post the Irecvs corresponding to these messages */
754: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
755:
756: PetscFree(onodes1);
757: PetscFree(olengths1);
759: /* Allocate Memory for outgoing messages */
760: PetscMalloc(2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt),&sbuf1);
761: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
762: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
763: /* allocate memory for outgoing data + buf to receive the first reply */
764: tmp = (PetscInt*)(ptr + size);
765: ctr = tmp + 2*msz;
767: {
768: PetscInt *iptr = tmp,ict = 0;
769: for (i=0; i<nrqs; i++) {
770: j = pa[i];
771: iptr += ict;
772: sbuf1[j] = iptr;
773: ict = w1[j];
774: }
775: }
777: /* Form the outgoing messages */
778: /* Initialise the header space */
779: for (i=0; i<nrqs; i++) {
780: j = pa[i];
781: sbuf1[j][0] = 0;
782: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
783: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
784: }
785:
786: /* Parse the isrow and copy data into outbuf */
787: for (i=0; i<ismax; i++) {
788: PetscMemzero(ctr,size*sizeof(PetscInt));
789: irow_i = irow[i];
790: jmax = nrow[i];
791: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
792: row = irow_i[j];
793: #if defined (PETSC_USE_CTABLE)
794: PetscGetProc(row,size,c->rangebs,&proc);
795: #else
796: proc = rtable[row];
797: #endif
798: if (proc != rank) { /* copy to the outgoing buf*/
799: ctr[proc]++;
800: *ptr[proc] = row;
801: ptr[proc]++;
802: }
803: }
804: /* Update the headers for the current IS */
805: for (j=0; j<size; j++) { /* Can Optimise this loop too */
806: if ((ctr_j = ctr[j])) {
807: sbuf1_j = sbuf1[j];
808: k = ++sbuf1_j[0];
809: sbuf1_j[2*k] = ctr_j;
810: sbuf1_j[2*k-1] = i;
811: }
812: }
813: }
815: /* Now post the sends */
816: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
817: for (i=0; i<nrqs; ++i) {
818: j = pa[i];
819: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
820: }
822:   /* Post Receives to capture the buffer size */
823: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
824: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
825: rbuf2[0] = tmp + msz;
826: for (i=1; i<nrqs; ++i) {
827: j = pa[i];
828: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
829: }
830: for (i=0; i<nrqs; ++i) {
831: j = pa[i];
832: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
833: }
835: /* Send to other procs the buf size they should allocate */
837: /* Receive messages*/
838: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
839: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
840: PetscMalloc(2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*),&sbuf2);
841: req_size = (PetscInt*)(sbuf2 + nrqr);
842: req_source = req_size + nrqr;
843:
844: {
845: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
846: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
848: for (i=0; i<nrqr; ++i) {
849: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
850: req_size[idex] = 0;
851: rbuf1_i = rbuf1[idex];
852: start = 2*rbuf1_i[0] + 1;
853: MPI_Get_count(r_status1+i,MPIU_INT,&end);
854: PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);
855: sbuf2_i = sbuf2[idex];
856: for (j=start; j<end; j++) {
857: id = rbuf1_i[j] - rstart;
858: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
859: sbuf2_i[j] = ncols;
860: req_size[idex] += ncols;
861: }
862: req_source[idex] = r_status1[i].MPI_SOURCE;
863: /* form the header */
864: sbuf2_i[0] = req_size[idex];
865: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
866: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
867: }
868: }
869: PetscFree(r_status1);
870: PetscFree(r_waits1);
872: /* recv buffer sizes */
873: /* Receive messages*/
875: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
876: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
877: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
878: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
879: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
881: for (i=0; i<nrqs; ++i) {
882: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
883: PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);
884: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
885: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
886: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
887: }
888: PetscFree(r_status2);
889: PetscFree(r_waits2);
890:
891: /* Wait on sends1 and sends2 */
892: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
893: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
895: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
896: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
897: PetscFree(s_status1);
898: PetscFree(s_status2);
899: PetscFree(s_waits1);
900: PetscFree(s_waits2);
902: /* Now allocate buffers for a->j, and send them off */
903: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
904: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
905: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
906: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
907:
908: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
909: {
910: for (i=0; i<nrqr; i++) {
911: rbuf1_i = rbuf1[i];
912: sbuf_aj_i = sbuf_aj[i];
913: ct1 = 2*rbuf1_i[0] + 1;
914: ct2 = 0;
915: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
916: kmax = rbuf1[i][2*j];
917: for (k=0; k<kmax; k++,ct1++) {
918: row = rbuf1_i[ct1] - rstart;
919: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
920: ncols = nzA + nzB;
921: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
923: /* load the column indices for this row into cols*/
924: cols = sbuf_aj_i + ct2;
925: for (l=0; l<nzB; l++) {
926: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
927: else break;
928: }
929: imark = l;
930: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
931: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
932: ct2 += ncols;
933: }
934: }
935: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
936: }
937: }
938: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
939: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
941: /* Allocate buffers for a->a, and send them off */
942: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
943: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
944: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
945: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
946:
947: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
948: {
949: for (i=0; i<nrqr; i++) {
950: rbuf1_i = rbuf1[i];
951: sbuf_aa_i = sbuf_aa[i];
952: ct1 = 2*rbuf1_i[0]+1;
953: ct2 = 0;
954: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
955: kmax = rbuf1_i[2*j];
956: for (k=0; k<kmax; k++,ct1++) {
957: row = rbuf1_i[ct1] - rstart;
958: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
959: ncols = nzA + nzB;
960: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
961: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
963: /* load the column values for this row into vals*/
964: vals = sbuf_aa_i+ct2*bs2;
965: for (l=0; l<nzB; l++) {
966: if ((bmap[cworkB[l]]) < cstart) {
967: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
968: }
969: else break;
970: }
971: imark = l;
972: for (l=0; l<nzA; l++) {
973: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
974: }
975: for (l=imark; l<nzB; l++) {
976: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
977: }
978: ct2 += ncols;
979: }
980: }
981: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
982: }
983: }
984: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
985: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
986: PetscFree(rbuf1);
988: /* Form the matrix */
989: /* create col map */
990: {
991: const PetscInt *icol_i;
992: #if defined (PETSC_USE_CTABLE)
993:     /* Create col map*/
994: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
995: for (i=0; i<ismax; i++) {
996: PetscTableCreate(ncol[i]+1,&colmaps[i]);
997: }
998: #else
999: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*c->Nbs*sizeof(PetscInt),&cmap);
1000: cmap[0] = (PetscInt*)(cmap + ismax);
1001: PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(PetscInt));
1002: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1003: #endif
1004: for (i=0; i<ismax; i++) {
1005: jmax = ncol[i];
1006: icol_i = icol[i];
1007: #if defined (PETSC_USE_CTABLE)
1008: lcol1_gcol1 = colmaps[i];
1009: for (j=0; j<jmax; j++) {
1010: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1011: }
1012: #else
1013: cmap_i = cmap[i];
1014: for (j=0; j<jmax; j++) {
1015: cmap_i[icol_i[j]] = j+1;
1016: }
1017: #endif
1018: }
1019: }
1021: /* Create lens which is required for MatCreate... */
1022: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1023: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);
1024: lens[0] = (PetscInt*)(lens + ismax);
1025: PetscMemzero(lens[0],j*sizeof(PetscInt));
1026: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1027:
1028: /* Update lens from local data */
1029: for (i=0; i<ismax; i++) {
1030: jmax = nrow[i];
1031: #if defined (PETSC_USE_CTABLE)
1032: lcol1_gcol1 = colmaps[i];
1033: #else
1034: cmap_i = cmap[i];
1035: #endif
1036: irow_i = irow[i];
1037: lens_i = lens[i];
1038: for (j=0; j<jmax; j++) {
1039: row = irow_i[j];
1040: #if defined (PETSC_USE_CTABLE)
1041: PetscGetProc(row,size,c->rangebs,&proc);
1042: #else
1043: proc = rtable[row];
1044: #endif
1045: if (proc == rank) {
1046: /* Get indices from matA and then from matB */
1047: row = row - rstart;
1048: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1049: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1050: #if defined (PETSC_USE_CTABLE)
1051: for (k=0; k<nzA; k++) {
1052: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1053: if (tt) { lens_i[j]++; }
1054: }
1055: for (k=0; k<nzB; k++) {
1056: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1057: if (tt) { lens_i[j]++; }
1058: }
1059: #else
1060: for (k=0; k<nzA; k++) {
1061: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1062: }
1063: for (k=0; k<nzB; k++) {
1064: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1065: }
1066: #endif
1067: }
1068: }
1069: }
1070: #if defined (PETSC_USE_CTABLE)
1071: /* Create row map*/
1072: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1073: for (i=0; i<ismax; i++){
1074: PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1075: }
1076: #else
1077: /* Create row map*/
1078: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);
1079: rmap[0] = (PetscInt*)(rmap + ismax);
1080: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1081: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1082: #endif
1083: for (i=0; i<ismax; i++) {
1084: irow_i = irow[i];
1085: jmax = nrow[i];
1086: #if defined (PETSC_USE_CTABLE)
1087: lrow1_grow1 = rowmaps[i];
1088: for (j=0; j<jmax; j++) {
1089: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1090: }
1091: #else
1092: rmap_i = rmap[i];
1093: for (j=0; j<jmax; j++) {
1094: rmap_i[irow_i[j]] = j;
1095: }
1096: #endif
1097: }
1099: /* Update lens from offproc data */
1100: {
1101: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1102: PetscMPIInt ii;
1104: for (tmp2=0; tmp2<nrqs; tmp2++) {
1105: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1106: idex = pa[ii];
1107: sbuf1_i = sbuf1[idex];
1108: jmax = sbuf1_i[0];
1109: ct1 = 2*jmax+1;
1110: ct2 = 0;
1111: rbuf2_i = rbuf2[ii];
1112: rbuf3_i = rbuf3[ii];
1113: for (j=1; j<=jmax; j++) {
1114: is_no = sbuf1_i[2*j-1];
1115: max1 = sbuf1_i[2*j];
1116: lens_i = lens[is_no];
1117: #if defined (PETSC_USE_CTABLE)
1118: lcol1_gcol1 = colmaps[is_no];
1119: lrow1_grow1 = rowmaps[is_no];
1120: #else
1121: cmap_i = cmap[is_no];
1122: rmap_i = rmap[is_no];
1123: #endif
1124: for (k=0; k<max1; k++,ct1++) {
1125: #if defined (PETSC_USE_CTABLE)
1126: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1127: row--;
1128: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1129: #else
1130: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1131: #endif
1132: max2 = rbuf2_i[ct1];
1133: for (l=0; l<max2; l++,ct2++) {
1134: #if defined (PETSC_USE_CTABLE)
1135: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1136: if (tt) {
1137: lens_i[row]++;
1138: }
1139: #else
1140: if (cmap_i[rbuf3_i[ct2]]) {
1141: lens_i[row]++;
1142: }
1143: #endif
1144: }
1145: }
1146: }
1147: }
1148: }
1149: PetscFree(r_status3);
1150: PetscFree(r_waits3);
1151: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1152: PetscFree(s_status3);
1153: PetscFree(s_waits3);
1155: /* Create the submatrices */
1156: if (scall == MAT_REUSE_MATRIX) {
1157: /*
1158: Assumes new rows are same length as the old rows, hence bug!
1159: */
1160: for (i=0; i<ismax; i++) {
1161: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1162: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) {
1163: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1164: }
1165: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1166: if (!flag) {
1167: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1168: }
1169:       /* Initialize matrix as if empty */
1170: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1171: submats[i]->factor = C->factor;
1172: }
1173: } else {
1174: for (i=0; i<ismax; i++) {
1175: MatCreate(PETSC_COMM_SELF,submats+i);
1176: MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);
1177: MatSetType(submats[i],((PetscObject)A)->type_name);
1178: MatSeqBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);
1179: MatSeqSBAIJSetPreallocation(submats[i],C->rmap->bs,0,lens[i]);
1180: }
1181: }
1183: /* Assemble the matrices */
1184: /* First assemble the local rows */
1185: {
1186: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1187: MatScalar *imat_a;
1188:
1189: for (i=0; i<ismax; i++) {
1190: mat = (Mat_SeqBAIJ*)submats[i]->data;
1191: imat_ilen = mat->ilen;
1192: imat_j = mat->j;
1193: imat_i = mat->i;
1194: imat_a = mat->a;
1196: #if defined (PETSC_USE_CTABLE)
1197: lcol1_gcol1 = colmaps[i];
1198: lrow1_grow1 = rowmaps[i];
1199: #else
1200: cmap_i = cmap[i];
1201: rmap_i = rmap[i];
1202: #endif
1203: irow_i = irow[i];
1204: jmax = nrow[i];
1205: for (j=0; j<jmax; j++) {
1206: row = irow_i[j];
1207: #if defined (PETSC_USE_CTABLE)
1208: PetscGetProc(row,size,c->rangebs,&proc);
1209: #else
1210: proc = rtable[row];
1211: #endif
1212: if (proc == rank) {
1213: row = row - rstart;
1214: nzA = a_i[row+1] - a_i[row];
1215: nzB = b_i[row+1] - b_i[row];
1216: cworkA = a_j + a_i[row];
1217: cworkB = b_j + b_i[row];
1218: vworkA = a_a + a_i[row]*bs2;
1219: vworkB = b_a + b_i[row]*bs2;
1220: #if defined (PETSC_USE_CTABLE)
1221: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1222: row--;
1223: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1224: #else
1225: row = rmap_i[row + rstart];
1226: #endif
1227: mat_i = imat_i[row];
1228: mat_a = imat_a + mat_i*bs2;
1229: mat_j = imat_j + mat_i;
1230: ilen_row = imat_ilen[row];
1232: /* load the column indices for this row into cols*/
1233: for (l=0; l<nzB; l++) {
1234: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1235: #if defined (PETSC_USE_CTABLE)
1236: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1237: if (tcol) {
1238: #else
1239: if ((tcol = cmap_i[ctmp])) {
1240: #endif
1241: *mat_j++ = tcol - 1;
1242: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1243: mat_a += bs2;
1244: ilen_row++;
1245: }
1246: } else break;
1247: }
1248: imark = l;
1249: for (l=0; l<nzA; l++) {
1250: #if defined (PETSC_USE_CTABLE)
1251: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1252: if (tcol) {
1253: #else
1254: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1255: #endif
1256: *mat_j++ = tcol - 1;
1257: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1258: mat_a += bs2;
1259: ilen_row++;
1260: }
1261: }
1262: for (l=imark; l<nzB; l++) {
1263: #if defined (PETSC_USE_CTABLE)
1264: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1265: if (tcol) {
1266: #else
1267: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1268: #endif
1269: *mat_j++ = tcol - 1;
1270: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1271: mat_a += bs2;
1272: ilen_row++;
1273: }
1274: }
1275: imat_ilen[row] = ilen_row;
1276: }
1277: }
1278:
1279: }
1280: }
1282: /* Now assemble the off proc rows*/
1283: {
1284: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1285: PetscInt *imat_j,*imat_i;
1286: MatScalar *imat_a,*rbuf4_i;
1287: PetscMPIInt ii;
1289: for (tmp2=0; tmp2<nrqs; tmp2++) {
1290: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1291: idex = pa[ii];
1292: sbuf1_i = sbuf1[idex];
1293: jmax = sbuf1_i[0];
1294: ct1 = 2*jmax + 1;
1295: ct2 = 0;
1296: rbuf2_i = rbuf2[ii];
1297: rbuf3_i = rbuf3[ii];
1298: rbuf4_i = rbuf4[ii];
1299: for (j=1; j<=jmax; j++) {
1300: is_no = sbuf1_i[2*j-1];
1301: #if defined (PETSC_USE_CTABLE)
1302: lrow1_grow1 = rowmaps[is_no];
1303: lcol1_gcol1 = colmaps[is_no];
1304: #else
1305: rmap_i = rmap[is_no];
1306: cmap_i = cmap[is_no];
1307: #endif
1308: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1309: imat_ilen = mat->ilen;
1310: imat_j = mat->j;
1311: imat_i = mat->i;
1312: imat_a = mat->a;
1313: max1 = sbuf1_i[2*j];
1314: for (k=0; k<max1; k++,ct1++) {
1315: row = sbuf1_i[ct1];
1316: #if defined (PETSC_USE_CTABLE)
1317: PetscTableFind(lrow1_grow1,row+1,&row);
1318: row--;
1319: if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1320: #else
1321: row = rmap_i[row];
1322: #endif
1323: ilen = imat_ilen[row];
1324: mat_i = imat_i[row];
1325: mat_a = imat_a + mat_i*bs2;
1326: mat_j = imat_j + mat_i;
1327: max2 = rbuf2_i[ct1];
1328: for (l=0; l<max2; l++,ct2++) {
1329: #if defined (PETSC_USE_CTABLE)
1330: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1331: if (tcol) {
1332: #else
1333: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1334: #endif
1335: *mat_j++ = tcol - 1;
1336: /* *mat_a++= rbuf4_i[ct2]; */
1337: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1338: mat_a += bs2;
1339: ilen++;
1340: }
1341: }
1342: imat_ilen[row] = ilen;
1343: }
1344: }
1345: }
1346: }
1347: PetscFree(r_status4);
1348: PetscFree(r_waits4);
1349: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1350: PetscFree(s_waits4);
1351: PetscFree(s_status4);
1353: /* Restore the indices */
1354: for (i=0; i<ismax; i++) {
1355: ISRestoreIndices(isrow[i],irow+i);
1356: ISRestoreIndices(iscol[i],icol+i);
1357: }
1359: /* Destroy allocated memory */
1360: PetscFree(irow);
1361: PetscFree(w1);
1362: PetscFree(pa);
1364: PetscFree(sbuf1);
1365: PetscFree(rbuf2);
1366: for (i=0; i<nrqr; ++i) {
1367: PetscFree(sbuf2[i]);
1368: }
1369: for (i=0; i<nrqs; ++i) {
1370: PetscFree(rbuf3[i]);
1371: PetscFree(rbuf4[i]);
1372: }
1374: PetscFree(sbuf2);
1375: PetscFree(rbuf3);
1376: PetscFree(rbuf4);
1377: PetscFree(sbuf_aj[0]);
1378: PetscFree(sbuf_aj);
1379: PetscFree(sbuf_aa[0]);
1380: PetscFree(sbuf_aa);
1382: #if defined (PETSC_USE_CTABLE)
1383: for (i=0; i<ismax; i++){
1384: PetscTableDestroy(rowmaps[i]);
1385: PetscTableDestroy(colmaps[i]);
1386: }
1387: PetscFree(colmaps);
1388: PetscFree(rowmaps);
1389: #else
1390: PetscFree(rmap);
1391: PetscFree(cmap);
1392: #endif
1393: PetscFree(lens);
1395: for (i=0; i<ismax; i++) {
1396: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1397: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1398: }
1399: return(0);
1400: }