Actual source code: mpiaijAssemble.cu

petsc-3.3-p5 2012-12-01
#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"
#include "../src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h"

#include <thrust/reduce.h>
#include <thrust/inner_product.h>
#include <thrust/count.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/replace.h>
#include <thrust/unique.h>

#include <cusp/array1d.h>
#include <cusp/print.h>
#include <cusp/coo_matrix.h>

#include <cusp/io/matrix_market.h>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/partition.h>
#include <thrust/remove.h>

// this example illustrates how to make repeated access to a range of values
// examples:
//   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
//   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
//   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
//   ...

template <typename Iterator>
class repeated_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type repeats;

        repeat_functor(difference_type repeats)
            : repeats(repeats) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return i / repeats;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the repeated_range iterator
    typedef PermutationIterator iterator;

    // construct repeated_range for the range [first,last)
    repeated_range(Iterator first, Iterator last, difference_type repeats)
        : first(first), last(last), repeats(repeats) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
    }

    iterator end(void) const
    {
        return begin() + repeats * (last - first);
    }

    protected:
    difference_type repeats;
    Iterator first;
    Iterator last;
};
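
#if 0
// A minimal, self-contained sketch of repeated_range on illustrative data;
// it mirrors how rowInd is built from d_elemRows further down. With
// data = [10, 20, 30] and repeats = 2 it prints "10 10 20 20 30 30".
#include <thrust/device_vector.h>
#include <iostream>
#include <iterator>

static void repeated_range_demo(void)
{
  thrust::device_vector<int> data(3);
  data[0] = 10; data[1] = 20; data[2] = 30;
  repeated_range<thrust::device_vector<int>::iterator> r(data.begin(), data.end(), 2);
  // each element of data appears repeats times, in order
  thrust::copy(r.begin(), r.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
}
#endif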

// this example illustrates how to repeat blocks in a range multiple times
// examples (the arguments after the range are repeats, then tile_size):
//   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 4) -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
//   tiled_range([0, 1, 2, 3], 3, 2) -> [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3]
//   ...

template <typename Iterator>
class tiled_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct tile_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type repeats;
        difference_type tile_size;

        tile_functor(difference_type repeats, difference_type tile_size)
            : tile_size(tile_size), repeats(repeats) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return tile_size * (i / (tile_size * repeats)) + i % tile_size;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<tile_functor, CountingIterator>   TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the tiled_range iterator
    typedef PermutationIterator iterator;

    // construct tiled_range for the range [first,last)
    tiled_range(Iterator first, Iterator last, difference_type repeats)
        : first(first), last(last), repeats(repeats), tile_size(last - first) {}

    tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
        : first(first), last(last), repeats(repeats), tile_size(tile_size)
    {
      // ASSERT((last - first) % tile_size == 0)
    }

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
    }

    iterator end(void) const
    {
        return begin() + repeats * (last - first);
    }

    protected:
    difference_type repeats;
    difference_type tile_size;
    Iterator first;
    Iterator last;
};
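
#if 0
// A minimal, self-contained sketch of tiled_range on illustrative data; it
// mirrors how colInd is built below, where each length-Nl block of rows is
// tiled Nl times. With data = [10, 20, 30, 40], repeats = 2, tile_size = 2
// it prints "10 20 10 20 30 40 30 40".
#include <thrust/device_vector.h>
#include <iostream>
#include <iterator>

static void tiled_range_demo(void)
{
  thrust::device_vector<int> data(4);
  data[0] = 10; data[1] = 20; data[2] = 30; data[3] = 40;
  tiled_range<thrust::device_vector<int>::iterator> t(data.begin(), data.end(), 2, 2);
  // each tile of length tile_size is repeated repeats times before moving on
  thrust::copy(t.begin(), t.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
}
#endif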

typedef cusp::device_memory memSpace;
typedef int   IndexType;
typedef float ValueType; // note: when PetscScalar is double (the default), values are narrowed to float on the copy to the device
typedef cusp::array1d<IndexType, memSpace> IndexArray;
typedef cusp::array1d<ValueType, memSpace> ValueArray;
typedef cusp::array1d<IndexType, cusp::host_memory> IndexHostArray;
typedef IndexArray::iterator IndexArrayIterator;
typedef ValueArray::iterator ValueArrayIterator;

struct is_diag
{
  IndexType first, last;

  is_diag(IndexType first, IndexType last) : first(first), last(last) {}

  template <typename Tuple>
  __host__ __device__
  bool operator()(Tuple t) {
    // Check that both the row and the column lie in the owned diagonal block
    const IndexType row = thrust::get<0>(t);
    const IndexType col = thrust::get<1>(t);
    return (row >= first) && (row < last) && (col >= first) && (col < last);
  }
};

struct is_nonlocal
{
  IndexType first, last;

  is_nonlocal(IndexType first, IndexType last) : first(first), last(last) {}

  template <typename Tuple>
  __host__ __device__
  bool operator()(Tuple t) {
    // Check that the row is not owned by this process
    const IndexType row = thrust::get<0>(t);
    return (row < first) || (row >= last);
  }
};

/*@C
  MatMPIAIJSetValuesBatch - Set multiple blocks of values into a matrix

  Collective on Mat

  Input Parameters:
+ J  - the assembled Mat object
. Ne - the number of blocks (elements)
. Nl - the block size (number of dof per element)
. elemRows - List of block row indices, in bunches of length Nl
- elemMats - List of block values, in bunches of Nl*Nl

  Level: advanced

.seealso: MatSetValues()
@*/
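#if 0
// A caller-side sketch with illustrative values: two elements (Ne = 2) with
// 3 dof each (Nl = 3) that share global rows 1 and 2. Each element
// contributes a dense 3x3 block, stored row-major in elemMats. In petsc-3.3
// the public entry point MatSetValuesBatch() forwards to implementations
// such as this one for MATMPIAIJCUSP matrices.
static PetscErrorCode assembleTwoElements(Mat J)
{
  PetscInt       elemRows[6] = {0, 1, 2,   // rows of element 0
                                1, 2, 3};  // rows of element 1
  PetscScalar    elemMats[18];             // 2 blocks of 3*3 values each
  PetscErrorCode ierr;

  PetscFunctionBegin;
  for (PetscInt i = 0; i < 18; ++i) elemMats[i] = 1.0; // illustrative values
  ierr = MatSetValuesBatch(J, 2, 3, elemRows, elemMats);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif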
#undef __FUNCT__
#define __FUNCT__ "MatSetValuesBatch_MPIAIJCUSP"
PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
{
  // Assumptions:
  //   1) Each elemMat is square, of size Nl x Nl
  //
  //      This means that any nonlocal entry (i,j) where i is owned by another process is matched to
  //        a) an offdiagonal entry (j,i) if j is diagonal, or
  //        b) another nonlocal entry (j,i) if j is offdiagonal
  //
  //      No - numSendEntries: The number of on-process  diagonal+offdiagonal entries
  //      numRecvEntries:      The number of off-process diagonal+offdiagonal entries
  //
  //  Glossary:
  //     diagonal: (i,j) such that i,j in [firstRow, lastRow)
  //  offdiagonal: (i,j) such that i in [firstRow, lastRow), and j not in [firstRow, lastRow)
  //     nonlocal: (i,j) such that i not in [firstRow, lastRow)
  //  nondiagonal: (i,j) such that i not in [firstRow, lastRow), or j not in [firstRow, lastRow)
  //   on-process: entries provided by elemMats
  //  off-process: entries received from other processes
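  //
  //  For example (illustrative): with two processes and rowRanges = {0, 2, 4},
  //  rank 0 owns rows [0, 2), so on rank 0 the entry (1,1) is diagonal,
  //  (1,3) is offdiagonal, and (3,1) is nonlocal.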
  MPI_Comm        comm = ((PetscObject) J)->comm;
  Mat_MPIAIJ     *j    = (Mat_MPIAIJ *) J->data;
  size_t          N    = Ne * Nl;    // Length of elemRows (dimension of unassembled space)
  size_t          No   = Ne * Nl*Nl; // Length of elemMats (total number of values)
  PetscInt        Nr;                // Size of J          (dimension of assembled space)
  PetscInt        firstRow, lastRow, firstCol;
  const PetscInt *rowRanges;
  PetscInt        numNonlocalRows;   // Number of rows in elemRows not owned by this process
  PetscInt        numSendEntries;    // Number of (i,j,v) entries sent to other processes
  PetscInt        numRecvEntries;    // Number of (i,j,v) entries received from other processes
  PetscInt        Nc;
  PetscMPIInt     numProcs, rank;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  // copy elemRows and elemMat to device
  IndexArray d_elemRows(elemRows, elemRows + N);
  ValueArray d_elemMats(elemMats, elemMats + No);

  ierr = MPI_Comm_size(comm, &numProcs);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  // get matrix information
  ierr = MatGetLocalSize(J, &Nr, PETSC_NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(J, &firstRow, &lastRow);CHKERRQ(ierr);
  ierr = MatGetOwnershipRanges(J, &rowRanges);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(J, &firstCol, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscInfo3(J, "Assembling matrix of size %d (rows %d -- %d)\n", Nr, firstRow, lastRow);CHKERRQ(ierr);

  // repeat elemRows entries Nl times
  ierr = PetscInfo(J, "Making row indices\n");CHKERRQ(ierr);
  repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);

  // tile rows of elemRows Nl times
  ierr = PetscInfo(J, "Making column indices\n");CHKERRQ(ierr);
  tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
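  // e.g. (illustrative) with Nl = 2 and d_elemRows = [r0, r1, ...]:
  //   rowInd = [r0, r0, r1, r1, ...] and colInd = [r0, r1, r0, r1, ...],
  // so zipping rowInd with colInd walks each Nl x Nl element matrix in
  // row-major order, matching the layout of elemMats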

  // Find number of nonlocal rows, convert nonlocal rows to procs, and send sizes of off-proc entries (could send diag and offdiag sizes)
  // TODO: Ask Nathan how to do this on GPU
  ierr = PetscLogEventBegin(MAT_SetValuesBatchI,0,0,0,0);CHKERRQ(ierr);
  PetscMPIInt *procSendSizes, *procRecvSizes;
  ierr = PetscMalloc2(numProcs, PetscMPIInt, &procSendSizes, numProcs, PetscMPIInt, &procRecvSizes);CHKERRQ(ierr);
  ierr = PetscMemzero(procSendSizes, numProcs * sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMemzero(procRecvSizes, numProcs * sizeof(PetscMPIInt));CHKERRQ(ierr);
  numNonlocalRows = 0;
  for(size_t i = 0; i < N; ++i) {
    const PetscInt row = elemRows[i];

    if ((row < firstRow) || (row >= lastRow)) {
      numNonlocalRows++;
      for(IndexType p = 0; p < numProcs; ++p) {
        if ((row >= rowRanges[p]) && (row < rowRanges[p+1])) {
          procSendSizes[p] += Nl;
          break;
        }
      }
    }
  }
  numSendEntries = numNonlocalRows*Nl;
  ierr = PetscInfo2(J, "Nonlocal rows %d total entries %d\n", numNonlocalRows, (PetscInt) No);CHKERRQ(ierr);
  // the sizes are PetscMPIInt, so exchange them as MPI_INT
  ierr = MPI_Alltoall(procSendSizes, 1, MPI_INT, procRecvSizes, 1, MPI_INT, comm);CHKERRQ(ierr);
  numRecvEntries = 0;
  for(PetscInt p = 0; p < numProcs; ++p) {
    numRecvEntries += procRecvSizes[p];
  }
  ierr = PetscInfo2(j->A, "Send entries %d Recv Entries %d\n", numSendEntries, numRecvEntries);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_SetValuesBatchI,0,0,0,0);CHKERRQ(ierr);
  // Allocate storage for "fat" COO representation of matrix
  ierr = PetscLogEventBegin(MAT_SetValuesBatchII,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscInfo2(j->A, "Making COO matrices, diag entries %d, nondiag entries %d\n", (PetscInt) (No-numSendEntries+numRecvEntries), numSendEntries*2);CHKERRQ(ierr);
  cusp::coo_matrix<IndexType,ValueType, memSpace> diagCOO(Nr, Nr, No-numSendEntries+numRecvEntries); // ALLOC: This is oversized because I also count offdiagonal entries
  IndexArray nondiagonalRows(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
  IndexArray nondiagonalCols(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
  ValueArray nondiagonalVals(numSendEntries+numSendEntries); // ALLOC: This is oversized because numSendEntries > on-process offdiagonal entries
  IndexArray nonlocalRows(numSendEntries);
  IndexArray nonlocalCols(numSendEntries);
  ValueArray nonlocalVals(numSendEntries);
  // partition on-process entries into diagonal and off-diagonal+nonlocal
  ierr = PetscInfo(J, "Splitting on-process entries into diagonal and off-diagonal+nonlocal\n");CHKERRQ(ierr);
  thrust::fill(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
  thrust::fill(nondiagonalRows.begin(),     nondiagonalRows.end(),     -1);
  thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(rowInd.begin(), colInd.begin(), d_elemMats.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(rowInd.end(),   colInd.end(),   d_elemMats.end())),
                         thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(),    diagCOO.column_indices.begin(), diagCOO.values.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(),        nondiagonalCols.begin(),        nondiagonalVals.begin())),
                         is_diag(firstRow, lastRow));
  // Current size without off-proc entries
  PetscInt diagonalSize    = (diagCOO.row_indices.end() - diagCOO.row_indices.begin()) - thrust::count(diagCOO.row_indices.begin(), diagCOO.row_indices.end(), -1);
  PetscInt nondiagonalSize = No - diagonalSize;
  ierr = PetscInfo2(j->A, "Diagonal size %d Nondiagonal size %d\n", diagonalSize, nondiagonalSize);CHKERRQ(ierr);
  ///cusp::print(diagCOO);
  ///cusp::print(nondiagonalRows);
  // partition on-process entries again into off-diagonal and nonlocal
  ierr = PetscInfo(J, "Splitting on-process entries into off-diagonal and nonlocal\n");CHKERRQ(ierr);
  thrust::stable_partition(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
                           thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(),   nondiagonalCols.end(),   nondiagonalVals.end())),
                           is_nonlocal(firstRow, lastRow));
  PetscInt nonlocalSize    = numSendEntries;
  PetscInt offdiagonalSize = nondiagonalSize - nonlocalSize;
  ierr = PetscInfo2(j->A, "Nonlocal size %d Offdiagonal size %d\n", nonlocalSize, offdiagonalSize);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_SetValuesBatchII,0,0,0,0);CHKERRQ(ierr);
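#if 0
  // A small sketch (illustrative values) of the partitioning idiom used
  // above: partition_copy splits a sequence into two outputs by a predicate,
  // here is_diag with ownership range [2, 4). Assumes
  // <thrust/device_vector.h> is also included.
  {
    thrust::device_vector<IndexType> rows(3), cols(3), diagR(3), diagC(3), restR(3), restC(3);
    rows[0] = 2; cols[0] = 3;  // diagonal:    row and col in [2, 4)
    rows[1] = 2; cols[1] = 5;  // offdiagonal: row owned, col not
    rows[2] = 0; cols[2] = 2;  // nonlocal:    row not owned
    thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(rows.begin(), cols.begin())),
                           thrust::make_zip_iterator(thrust::make_tuple(rows.end(),   cols.end())),
                           thrust::make_zip_iterator(thrust::make_tuple(diagR.begin(), diagC.begin())),
                           thrust::make_zip_iterator(thrust::make_tuple(restR.begin(), restC.begin())),
                           is_diag(2, 4));
    // diagR/diagC receive (2,3); restR/restC receive (2,5) and (0,2)
  }
#endif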
  ///cusp::print(nondiagonalRows);
  // send off-proc entries (pack this up later)
  ierr = PetscLogEventBegin(MAT_SetValuesBatchIII,0,0,0,0);CHKERRQ(ierr);
  PetscMPIInt *procSendDispls, *procRecvDispls;
  PetscInt    *sendRows, *recvRows;
  PetscInt    *sendCols, *recvCols;
  PetscScalar *sendVals, *recvVals;
  ierr = PetscMalloc2(numProcs, PetscMPIInt, &procSendDispls, numProcs, PetscMPIInt, &procRecvDispls);CHKERRQ(ierr);
  ierr = PetscMalloc3(numSendEntries, PetscInt, &sendRows, numSendEntries, PetscInt, &sendCols, numSendEntries, PetscScalar, &sendVals);CHKERRQ(ierr);
  ierr = PetscMalloc3(numRecvEntries, PetscInt, &recvRows, numRecvEntries, PetscInt, &recvCols, numRecvEntries, PetscScalar, &recvVals);CHKERRQ(ierr);
  procSendDispls[0] = procRecvDispls[0] = 0;
  for(PetscInt p = 1; p < numProcs; ++p) {
    procSendDispls[p] = procSendDispls[p-1] + procSendSizes[p-1];
    procRecvDispls[p] = procRecvDispls[p-1] + procRecvSizes[p-1];
  }
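  // e.g. (illustrative) procSendSizes = {0, 6, 3} gives procSendDispls = {0, 0, 6}:
  // entries bound for rank 1 start at offset 0 and those for rank 2 at offset 6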
#if 0
  thrust::copy(nondiagonalRows.begin(), nondiagonalRows.begin()+nonlocalSize, sendRows);
  thrust::copy(nondiagonalCols.begin(), nondiagonalCols.begin()+nonlocalSize, sendCols);
  thrust::copy(nondiagonalVals.begin(), nondiagonalVals.begin()+nonlocalSize, sendVals);
#else
  thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin())),
               thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.begin(), nondiagonalCols.begin(), nondiagonalVals.begin()))+nonlocalSize,
               thrust::make_zip_iterator(thrust::make_tuple(sendRows,                sendCols,                sendVals)));
#endif
  //   could pack into a struct and unpack
  ierr = MPI_Alltoallv(sendRows, procSendSizes, procSendDispls, MPIU_INT,    recvRows, procRecvSizes, procRecvDispls, MPIU_INT,    comm);CHKERRQ(ierr);
  ierr = MPI_Alltoallv(sendCols, procSendSizes, procSendDispls, MPIU_INT,    recvCols, procRecvSizes, procRecvDispls, MPIU_INT,    comm);CHKERRQ(ierr);
  ierr = MPI_Alltoallv(sendVals, procSendSizes, procSendDispls, MPIU_SCALAR, recvVals, procRecvSizes, procRecvDispls, MPIU_SCALAR, comm);CHKERRQ(ierr);
  ierr = PetscFree2(procSendSizes, procRecvSizes);CHKERRQ(ierr);
  ierr = PetscFree2(procSendDispls, procRecvDispls);CHKERRQ(ierr);
  ierr = PetscFree3(sendRows, sendCols, sendVals);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_SetValuesBatchIII,0,0,0,0);CHKERRQ(ierr);

  ierr = PetscLogEventBegin(MAT_SetValuesBatchIV,0,0,0,0);CHKERRQ(ierr);
  // Create off-diagonal matrix
  cusp::coo_matrix<IndexType,ValueType, memSpace> offdiagCOO(Nr, Nr, offdiagonalSize+numRecvEntries); // ALLOC: This is oversized because we count diagonal entries in numRecvEntries
  // partition again into diagonal and off-diagonal
  IndexArray d_recvRows(recvRows, recvRows+numRecvEntries);
  IndexArray d_recvCols(recvCols, recvCols+numRecvEntries);
  ValueArray d_recvVals(recvVals, recvVals+numRecvEntries);
#if 0
  thrust::copy(nondiagonalRows.end()-offdiagonalSize, nondiagonalRows.end(), offdiagCOO.row_indices.begin());
  thrust::copy(nondiagonalCols.end()-offdiagonalSize, nondiagonalCols.end(), offdiagCOO.column_indices.begin());
  thrust::copy(nondiagonalVals.end()-offdiagonalSize, nondiagonalVals.end(), offdiagCOO.values.begin());
#else
  thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(),          nondiagonalCols.end(),             nondiagonalVals.end()))-offdiagonalSize,
               thrust::make_zip_iterator(thrust::make_tuple(nondiagonalRows.end(),          nondiagonalCols.end(),             nondiagonalVals.end())),
               thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin(), offdiagCOO.values.begin())));
#endif
  thrust::fill(diagCOO.row_indices.begin()+diagonalSize,       diagCOO.row_indices.end(),    -1);
  thrust::fill(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.row_indices.end(), -1);
  thrust::partition_copy(thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.begin(), d_recvCols.begin(), d_recvVals.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(d_recvRows.end(),   d_recvCols.end(),   d_recvVals.end())),
                         thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin()+diagonalSize, diagCOO.column_indices.begin()+diagonalSize, diagCOO.values.begin()+diagonalSize)),
                         thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin()+offdiagonalSize, offdiagCOO.column_indices.begin()+offdiagonalSize, offdiagCOO.values.begin()+offdiagonalSize)),
                         is_diag(firstRow, lastRow));
  ierr = PetscFree3(recvRows, recvCols, recvVals);CHKERRQ(ierr);
  diagonalSize    = (diagCOO.row_indices.end()    - diagCOO.row_indices.begin())    - thrust::count(diagCOO.row_indices.begin(),    diagCOO.row_indices.end(),    -1);
  offdiagonalSize = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - thrust::count(offdiagCOO.row_indices.begin(), offdiagCOO.row_indices.end(), -1);

  // sort COO format by (i,j), this is the most costly step
  ierr = PetscInfo(J, "Sorting rows and columns\n");CHKERRQ(ierr);
  diagCOO.sort_by_row_and_column();
  offdiagCOO.sort_by_row_and_column();
  PetscInt diagonalOffset    = (diagCOO.row_indices.end()    - diagCOO.row_indices.begin())    - diagonalSize;
  PetscInt offdiagonalOffset = (offdiagCOO.row_indices.end() - offdiagCOO.row_indices.begin()) - offdiagonalSize;

  // print the "fat" COO representation
  if (PetscLogPrintInfo) {
    cusp::print(diagCOO);
    cusp::print(offdiagCOO);
  }

  // Figure out the number of unique off-diagonal columns
  // (an upper bound: entries are sorted by (row, col), so columns are only sorted within each row)
  Nc = thrust::inner_product(offdiagCOO.column_indices.begin()+offdiagonalOffset,
                             offdiagCOO.column_indices.end() - 1,
                             offdiagCOO.column_indices.begin()+offdiagonalOffset + 1,
                             size_t(1), thrust::plus<size_t>(), thrust::not_equal_to<IndexType>());
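#if 0
  // A small sketch (illustrative values) of the "count the changes" idiom
  // used here and below: for a sorted sequence, the number of unique entries
  // is one plus the number of adjacent pairs that differ. Assumes
  // <thrust/device_vector.h> is also included.
  {
    thrust::device_vector<IndexType> cols(6);
    cols[0] = 4; cols[1] = 4; cols[2] = 7; cols[3] = 9; cols[4] = 9; cols[5] = 9;
    size_t nUnique = thrust::inner_product(cols.begin(), cols.end() - 1, cols.begin() + 1,
                                           size_t(1), thrust::plus<size_t>(),
                                           thrust::not_equal_to<IndexType>());
    // nUnique == 3, from the transitions 4 -> 7 and 7 -> 9
  }
#endif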

  // compute number of unique (i,j) entries
  //   this counts the number of changes as we move along the (i,j) list
  ierr = PetscInfo(J, "Computing number of unique entries\n");CHKERRQ(ierr);
  size_t num_diag_entries = thrust::inner_product
    (thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
     thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(),   diagCOO.column_indices.end())) - 1,
     thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset + 1,
     size_t(1),
     thrust::plus<size_t>(),
     thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
  size_t num_offdiag_entries = thrust::inner_product
    (thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
     thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(),   offdiagCOO.column_indices.end())) - 1,
     thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset + 1,
     size_t(1),
     thrust::plus<size_t>(),
     thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());

  // allocate COO storage for final matrix
  ierr = PetscInfo(J, "Allocating compressed matrices\n");CHKERRQ(ierr);
  cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_diag_entries);
  cusp::coo_matrix<IndexType, ValueType, memSpace> B(Nr, Nc, num_offdiag_entries);

  // sum values with the same (i,j) index
  // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
  //     the Cusp one is 2x faster, but still not optimal
  // This could possibly be done in-place
  ierr = PetscInfo(J, "Compressing matrices\n");CHKERRQ(ierr);
  thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.begin(), diagCOO.column_indices.begin())) + diagonalOffset,
     thrust::make_zip_iterator(thrust::make_tuple(diagCOO.row_indices.end(),   diagCOO.column_indices.end())),
     diagCOO.values.begin() + diagonalOffset,
     thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
     A.values.begin(),
     thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
     thrust::plus<ValueType>());

  thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.begin(), offdiagCOO.column_indices.begin())) + offdiagonalOffset,
     thrust::make_zip_iterator(thrust::make_tuple(offdiagCOO.row_indices.end(),   offdiagCOO.column_indices.end())),
     offdiagCOO.values.begin() + offdiagonalOffset,
     thrust::make_zip_iterator(thrust::make_tuple(B.row_indices.begin(), B.column_indices.begin())),
     B.values.begin(),
     thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
     thrust::plus<ValueType>());
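#if 0
  // A small sketch (illustrative values) of the compression step on flat
  // arrays: after sorting, duplicate keys are adjacent, so reduce_by_key sums
  // their values. Assumes <thrust/device_vector.h> is also included.
  {
    thrust::device_vector<IndexType> key(4), keyOut(4);
    thrust::device_vector<ValueType> val(4), valOut(4);
    key[0] = 0; key[1] = 0; key[2] = 1; key[3] = 3;
    val[0] = 1; val[1] = 2; val[2] = 5; val[3] = 7;
    thrust::reduce_by_key(key.begin(), key.end(), val.begin(), keyOut.begin(), valOut.begin());
    // keyOut = [0, 1, 3], valOut = [3, 5, 7]
  }
#endif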

  // Convert row and column numbers
  if (firstRow) {
    thrust::transform(A.row_indices.begin(), A.row_indices.end(), thrust::make_constant_iterator(firstRow), A.row_indices.begin(), thrust::minus<IndexType>());
    thrust::transform(B.row_indices.begin(), B.row_indices.end(), thrust::make_constant_iterator(firstRow), B.row_indices.begin(), thrust::minus<IndexType>());
  }
  if (firstCol) {
    thrust::transform(A.column_indices.begin(), A.column_indices.end(), thrust::make_constant_iterator(firstCol), A.column_indices.begin(), thrust::minus<IndexType>());
  }
#if 0 // This is done by MatSetUpMultiply_MPIAIJ()
  //   TODO: Get better code from Nathan
  IndexArray d_colmap(Nc);
  thrust::unique_copy(B.column_indices.begin(), B.column_indices.end(), d_colmap.begin());
  IndexHostArray colmap(d_colmap.begin(), d_colmap.end());
  IndexType      newCol = 0;
  for(IndexHostArray::iterator c_iter = colmap.begin(); c_iter != colmap.end(); ++c_iter, ++newCol) {
    thrust::replace(B.column_indices.begin(), B.column_indices.end(), *c_iter, newCol);
  }
#endif

  // print the final matrix
  if (PetscLogPrintInfo) {
    cusp::print(A);
    cusp::print(B);
  }

  ierr = PetscInfo(J, "Converting to PETSc matrix\n");CHKERRQ(ierr);
  ierr = MatSetType(J, MATMPIAIJCUSP);CHKERRQ(ierr);
  CUSPMATRIX *Agpu = new CUSPMATRIX;
  CUSPMATRIX *Bgpu = new CUSPMATRIX;
  cusp::convert(A, *Agpu);
  cusp::convert(B, *Bgpu);
  if (PetscLogPrintInfo) {
    cusp::print(*Agpu);
    cusp::print(*Bgpu);
  }
  {
    ierr = PetscInfo(J, "Copying to CPU matrix\n");CHKERRQ(ierr);
    ierr = MatCUSPCopyFromGPU(j->A, Agpu);CHKERRQ(ierr);
    ierr = MatCUSPCopyFromGPU(j->B, Bgpu);CHKERRQ(ierr);
#if 0 // This is done by MatSetUpMultiply_MPIAIJ()
    // Create the column map
    ierr = PetscFree(j->garray);CHKERRQ(ierr);
    ierr = PetscMalloc(Nc * sizeof(PetscInt), &j->garray);CHKERRQ(ierr);
    PetscInt c = 0;
    for(IndexHostArray::iterator c_iter = colmap.begin(); c_iter != colmap.end(); ++c_iter, ++c) {
      j->garray[c] = *c_iter;
    }
#endif
  }
  ierr = PetscLogEventEnd(MAT_SetValuesBatchIV,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}