Actual source code: fftw.c

petsc-3.5.2 2014-09-08
Report Typos and Errors
  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/examples/tests
  5: */

  7: #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

 12: typedef struct {
 13:   ptrdiff_t    ndim_fftw,*dim_fftw; /* number of transform dimensions and their sizes, in FFTW's native ptrdiff_t */
 14: #if defined(PETSC_USE_64BIT_INDICES)
 15:   fftw_iodim64 *iodims; /* guru-interface dimension descriptors (64-bit-index build) */
 16: #else
 17:   fftw_iodim   *iodims; /* guru-interface dimension descriptors */
 18: #endif
 19:   PetscInt     partial_dim; /* NOTE(review): used as a per-slab element count in the scatter routines; appears to be the product of all dims after the first — confirm where it is set */
 20:   fftw_plan    p_forward,p_backward; /* cached plans, created lazily on first forward/backward mult */
 21:   unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 22:   PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because fftw plan should be
 23:                                                             executed for the arrays with which the plan was created */
 24: } Mat_FFTW;

/* Forward declarations of the FFTW matrix/vector operations implemented below */
 26: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 27: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 28: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 29: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 30: extern PetscErrorCode MatDestroy_FFTW(Mat);
 31: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);

 33: /* MatMult_SeqFFTW performs serial forward DFT
 34:    Input parameters:
 35:      A - the matrix
 36:      x - the vector on which FDFT will be performed

 38:    Output parameter:
 39:      y - vector that stores result of FDFT
 40: */
 43: PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
 44: {
 46:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 47:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 48:   PetscScalar    *x_array,*y_array;
 49: #if defined(PETSC_USE_COMPLEX)
 50: #if defined(PETSC_USE_64BIT_INDICES) 
 51:   fftw_iodim64   *iodims;
 52: #else
 53:   fftw_iodim     *iodims;
 54: #endif
 55:   PetscInt       i;
 56: #endif
 57:   PetscInt       ndim = fft->ndim,*dim = fft->dim;

 60:   VecGetArray(x,&x_array);
 61:   VecGetArray(y,&y_array);
 62:   if (!fftw->p_forward) { /* create a plan, then excute it */
 63:     switch (ndim) {
 64:     case 1:
 65: #if defined(PETSC_USE_COMPLEX)
 66:       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 67: #else
 68:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 69: #endif
 70:       break;
 71:     case 2:
 72: #if defined(PETSC_USE_COMPLEX)
 73:       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 74: #else
 75:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 76: #endif
 77:       break;
 78:     case 3:
 79: #if defined(PETSC_USE_COMPLEX)
 80:       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 81: #else
 82:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 83: #endif
 84:       break;
 85:     default:
 86: #if defined(PETSC_USE_COMPLEX)
 87:       iodims = fftw->iodims;
 88: #if defined(PETSC_USE_64BIT_INDICES)
 89:       if (ndim) {
 90:         iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
 91:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
 92:         for (i=ndim-2; i>=0; --i) {
 93:           iodims[i].n = (ptrdiff_t)dim[i];
 94:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
 95:         }
 96:       }
 97:       fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 98: #else
 99:       if (ndim) {
100:         iodims[ndim-1].n = (int)dim[ndim-1];
101:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
102:         for (i=ndim-2; i>=0; --i) {
103:           iodims[i].n = (int)dim[i];
104:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
105:         }
106:       }
107:       fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
108: #endif

110: #else
111:       fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
112: #endif
113:       break;
114:     }
115:     fftw->finarray  = x_array;
116:     fftw->foutarray = y_array;
117:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
118:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
119:     fftw_execute(fftw->p_forward);
120:   } else { /* use existing plan */
121:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
122:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
123:     } else {
124:       fftw_execute(fftw->p_forward);
125:     }
126:   }
127:   VecRestoreArray(y,&y_array);
128:   VecRestoreArray(x,&x_array);
129:   return(0);
130: }

132: /* MatMultTranspose_SeqFFTW performs serial backward DFT
133:    Input parameter:
134:      A - the matrix
135:      x - the vector on which BDFT will be performed

137:    Output parameter:
138:      y - vector that stores result of BDFT
139: */

143: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
144: {
146:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
147:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
148:   PetscScalar    *x_array,*y_array;
149:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
150: #if defined(PETSC_USE_COMPLEX)
151: #if defined(PETSC_USE_64BIT_INDICES)
152:   fftw_iodim64   *iodims=fftw->iodims;
153: #else
154:   fftw_iodim     *iodims=fftw->iodims;
155: #endif
156: #endif
157: 
159:   VecGetArray(x,&x_array);
160:   VecGetArray(y,&y_array);
161:   if (!fftw->p_backward) { /* create a plan, then excute it */
162:     switch (ndim) {
163:     case 1:
164: #if defined(PETSC_USE_COMPLEX)
165:       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
166: #else
167:       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
168: #endif
169:       break;
170:     case 2:
171: #if defined(PETSC_USE_COMPLEX)
172:       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
173: #else
174:       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
175: #endif
176:       break;
177:     case 3:
178: #if defined(PETSC_USE_COMPLEX)
179:       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
180: #else
181:       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
182: #endif
183:       break;
184:     default:
185: #if defined(PETSC_USE_COMPLEX)
186: #if defined(PETSC_USE_64BIT_INDICES)
187:       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
188: #else
189:       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
190: #endif
191: #else
192:       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
193: #endif
194:       break;
195:     }
196:     fftw->binarray  = x_array;
197:     fftw->boutarray = y_array;
198:     fftw_execute(fftw->p_backward);
199:   } else { /* use existing plan */
200:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
201:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
202:     } else {
203:       fftw_execute(fftw->p_backward);
204:     }
205:   }
206:   VecRestoreArray(y,&y_array);
207:   VecRestoreArray(x,&x_array);
208:   return(0);
209: }

211: /* MatMult_MPIFFTW performs forward DFT in parallel
212:    Input parameter:
213:      A - the matrix
214:      x - the vector on which FDFT will be performed

216:    Output parameter:
217:    y   - vector that stores result of FDFT
218: */
221: PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
222: {
224:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
225:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
226:   PetscScalar    *x_array,*y_array;
227:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
228:   MPI_Comm       comm;

231:   PetscObjectGetComm((PetscObject)A,&comm);
232:   VecGetArray(x,&x_array);
233:   VecGetArray(y,&y_array);
234:   if (!fftw->p_forward) { /* create a plan, then excute it */
235:     switch (ndim) {
236:     case 1:
237: #if defined(PETSC_USE_COMPLEX)
238:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
239: #else
240:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
241: #endif
242:       break;
243:     case 2:
244: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
245:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
246: #else
247:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
248: #endif
249:       break;
250:     case 3:
251: #if defined(PETSC_USE_COMPLEX)
252:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
253: #else
254:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
255: #endif
256:       break;
257:     default:
258: #if defined(PETSC_USE_COMPLEX)
259:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
260: #else
261:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
262: #endif
263:       break;
264:     }
265:     fftw->finarray  = x_array;
266:     fftw->foutarray = y_array;
267:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
268:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
269:     fftw_execute(fftw->p_forward);
270:   } else { /* use existing plan */
271:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
272:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
273:     } else {
274:       fftw_execute(fftw->p_forward);
275:     }
276:   }
277:   VecRestoreArray(y,&y_array);
278:   VecRestoreArray(x,&x_array);
279:   return(0);
280: }

282: /* MatMultTranspose_MPIFFTW performs parallel backward DFT
283:    Input parameter:
284:      A - the matrix
285:      x - the vector on which BDFT will be performed

287:    Output parameter:
288:      y - vector that stores result of BDFT
289: */
292: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
293: {
295:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
296:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
297:   PetscScalar    *x_array,*y_array;
298:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
299:   MPI_Comm       comm;

302:   PetscObjectGetComm((PetscObject)A,&comm);
303:   VecGetArray(x,&x_array);
304:   VecGetArray(y,&y_array);
305:   if (!fftw->p_backward) { /* create a plan, then excute it */
306:     switch (ndim) {
307:     case 1:
308: #if defined(PETSC_USE_COMPLEX)
309:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
310: #else
311:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
312: #endif
313:       break;
314:     case 2:
315: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
316:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
317: #else
318:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
319: #endif
320:       break;
321:     case 3:
322: #if defined(PETSC_USE_COMPLEX)
323:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
324: #else
325:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
326: #endif
327:       break;
328:     default:
329: #if defined(PETSC_USE_COMPLEX)
330:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
331: #else
332:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
333: #endif
334:       break;
335:     }
336:     fftw->binarray  = x_array;
337:     fftw->boutarray = y_array;
338:     fftw_execute(fftw->p_backward);
339:   } else { /* use existing plan */
340:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
341:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
342:     } else {
343:       fftw_execute(fftw->p_backward);
344:     }
345:   }
346:   VecRestoreArray(y,&y_array);
347:   VecRestoreArray(x,&x_array);
348:   return(0);
349: }

353: PetscErrorCode MatDestroy_FFTW(Mat A)
354: {
355:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
356:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

360:   fftw_destroy_plan(fftw->p_forward);
361:   fftw_destroy_plan(fftw->p_backward);
362:   if (fftw->iodims) {
363:     free(fftw->iodims);
364:   }
365:   PetscFree(fftw->dim_fftw);
366:   PetscFree(fft->data);
367:   fftw_mpi_cleanup();
368:   return(0);
369: }

371: #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
/* VecDestroy_MPIFFTW - destructor installed on vectors created by MatGetVecsFFTW_FFTW():
   their data arrays come from fftw_malloc(), so the array is released with fftw_free()
   before the standard MPI vector destroy runs. */
 374: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
 375: {
 377:   PetscScalar    *array;

 380:   VecGetArray(v,&array);
 381:   fftw_free((fftw_complex*)array); /* pairs with the fftw_malloc() in MatGetVecsFFTW_FFTW() */
 382:   VecRestoreArray(v,&array);
 383:   VecDestroy_MPI(v); /* NOTE(review): VecDestroy_MPI presumably does not free the user-provided array again — confirm */
 384:   return(0);
 385: }

389: /*@
390:    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
391:      parallel layout determined by FFTW

393:    Collective on Mat

395:    Input Parameter:
396: .   A - the matrix

398:    Output Parameter:
 399: +   x - (optional) input vector of forward FFTW
 400: .   y - (optional) output vector of forward FFTW
 401: -   z - (optional) output vector of backward FFTW

403:   Level: advanced

405:   Note: The parallel layout of output of forward FFTW is always same as the input
406:         of the backward FFTW. But parallel layout of the input vector of forward
407:         FFTW might not be same as the output of backward FFTW.
408:         Also note that we need to provide enough space while doing parallel real transform.
 409:         We need to pad extra zeros at the end of the last dimension. For this reason one needs to
 410:         invoke the routine fftw_mpi_local_size_transposed. Remember that one has to change the
 411:         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
412:         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
413:         zeros if it is odd only one zero is needed.
414:         Lastly one needs some scratch space at the end of data set in each process. alloc_local
415:         figures out how much space is needed, i.e. it figures out the data+scratch space for
416:         each processor and returns that.

418: .seealso: MatCreateFFTW()
419: @*/
 420: PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
 421: {

  /* Dispatch to the implementation registered on the matrix under "MatGetVecsFFTW_C"
     (MatGetVecsFFTW_FFTW below); PetscTryMethod is a no-op by PETSc convention if no
     such method is composed with A */
 425:   PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
 426:   return(0);
 427: }

/* MatGetVecsFFTW_FFTW - creates vectors whose layouts match the distribution FFTW
   chooses for this transform.  For parallel real transforms the vectors are sized
   with the padded last dimension 2*(dim[last]/2+1), and the data arrays come from
   fftw_malloc() (released later by VecDestroy_MPIFFTW, which is installed as the
   destroy op on each created vector). */
 431: PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
 432: {
 434:   PetscMPIInt    size,rank;
 435:   MPI_Comm       comm;
 436:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 437:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 438:   PetscInt       N     = fft->N;
 439:   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;

 444:   PetscObjectGetComm((PetscObject)A,&comm);

 446:   MPI_Comm_size(comm, &size);
 447:   MPI_Comm_rank(comm, &rank);
 448:   if (size == 1) { /* sequential case */
 449: #if defined(PETSC_USE_COMPLEX)
 450:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
 451:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,N,fout);}
 452:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,N,bout);}
 453: #else
 454:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
 455:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,n,fout);}
 456:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,n,bout);}
 457: #endif
 458:   } else { /* parallel cases */
 459:     ptrdiff_t    alloc_local,local_n0,local_0_start;
 460:     ptrdiff_t    local_n1;
 461:     fftw_complex *data_fout;
 462:     ptrdiff_t    local_1_start;
 463: #if defined(PETSC_USE_COMPLEX)
 464:     fftw_complex *data_fin,*data_bout;
 465: #else
 466:     double    *data_finr,*data_boutr;
 467:     PetscInt  n1,N1;
 468:     ptrdiff_t temp;
 469: #endif

 471:     switch (ndim) {
 472:     case 1:
 473: #if !defined(PETSC_USE_COMPLEX)
 474:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
 475: #else
      /* 1D complex: forward and backward distributions may differ, so each is queried separately */
 476:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
 477:       if (fin) {
 478:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 479:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);

 481:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
 482:       }
 483:       if (fout) {
 484:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 485:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);

 487:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
 488:       }
 489:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
 490:       if (bout) {
 491:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 492:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);

 494:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
 495:       }
 496:       break;
 497: #endif
      /* in the real build the SETERRQ above returns, so there is no fall-through into case 2 */
 498:     case 2:
 499: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
 500:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
 501:       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
 502:       if (fin) {
 503:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
 504:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

 506:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
 507:       }
 508:       if (bout) {
 509:         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
 510:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

 512:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
 513:       }
 514:       if (fout) {
 515:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 516:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);

 518:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
 519:       }
 520: #else
 521:       /* Get local size */
 522:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
 523:       if (fin) {
 524:         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 525:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

 527:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
 528:       }
 529:       if (fout) {
 530:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 531:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

 533:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
 534:       }
 535:       if (bout) {
 536:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 537:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

 539:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
 540:       }
 541: #endif
 542:       break;
 543:     case 3:
 544: #if !defined(PETSC_USE_COMPLEX)
 545:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
 546:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
 547:       if (fin) {
 548:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
 549:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

 551:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
 552:       }
 553:       if (bout) {
 554:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
 555:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

 557:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
 558:       }

 560:       if (fout) {
 561:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 562:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);

 564:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
 565:       }
 566: #else
 567:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
 568:       if (fin) {
 569:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 570:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

 572:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
 573:       }
 574:       if (fout) {
 575:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 576:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

 578:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
 579:       }
 580:       if (bout) {
 581:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 582:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

 584:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
 585:       }
 586: #endif
 587:       break;
 588:     default:
 589: #if !defined(PETSC_USE_COMPLEX)
      /* temporarily swap the last dimension to n/2+1 for the local-size query, then restore it */
 590:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

 592:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

 594:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
 595:       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

 597:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

 599:       if (fin) {
 600:         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
 601:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);

 603:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
 604:       }
 605:       if (bout) {
 606:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
 607:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);

 609:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
 610:       }

 612:       if (fout) {
 613:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 614:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);

 616:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
 617:       }
 618: #else
 619:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
 620:       if (fin) {
 621:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 622:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

 624:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
 625:       }
 626:       if (fout) {
 627:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 628:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

 630:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
 631:       }
 632:       if (bout) {
 633:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
 634:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

 636:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
 637:       }
 638: #endif
 639:       break;
 640:     }
 641:   }
 642:   return(0);
 643: }

647: /*@
648:    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

650:    Collective on Mat

652:    Input Parameters:
653: +  A - FFTW matrix
654: -  x - the PETSc vector

656:    Output Parameters:
657: .  y - the FFTW vector

659:   Options Database Keys:
660: . -mat_fftw_plannerflags - set FFTW planner flags

662:    Level: intermediate

 664:    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
 665:          one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
 666:          zeros. This routine does that job by a scattering operation.

668: .seealso: VecScatterFFTWToPetsc()
669: @*/
 670: PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
 671: {

  /* Dispatch to the implementation registered on the matrix under "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW below); PetscTryMethod is a no-op by PETSc convention if no
     such method is composed with A */
 675:   PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
 676:   return(0);
 677: }

681: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
682: {
684:   MPI_Comm       comm;
685:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
686:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
687:   PetscInt       N     =fft->N;
688:   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
689:   PetscInt       low;
690:   PetscMPIInt    rank,size;
691:   PetscInt       vsize,vsize1;
692:   ptrdiff_t      local_n0,local_0_start;
693:   ptrdiff_t      local_n1,local_1_start;
694:   VecScatter     vecscat;
695:   IS             list1,list2;
696: #if !defined(PETSC_USE_COMPLEX)
697:   PetscInt       i,j,k,partial_dim;
698:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
699:   PetscInt       NM;
700:   ptrdiff_t      temp;
701: #endif

704:   PetscObjectGetComm((PetscObject)A,&comm);
705:   MPI_Comm_size(comm, &size);
706:   MPI_Comm_rank(comm, &rank);
707:   VecGetOwnershipRange(y,&low,NULL);

709:   if (size==1) {
710:     VecGetSize(x,&vsize);
711:     VecGetSize(y,&vsize1);
712:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
713:     VecScatterCreate(x,list1,y,list1,&vecscat);
714:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
715:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
716:     VecScatterDestroy(&vecscat);
717:     ISDestroy(&list1);
718:   } else {
719:     switch (ndim) {
720:     case 1:
721: #if defined(PETSC_USE_COMPLEX)
722:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

724:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
725:       ISCreateStride(comm,local_n0,low,1,&list2);
726:       VecScatterCreate(x,list1,y,list2,&vecscat);
727:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
728:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
729:       VecScatterDestroy(&vecscat);
730:       ISDestroy(&list1);
731:       ISDestroy(&list2);
732: #else
733:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
734: #endif
735:       break;
736:     case 2:
737: #if defined(PETSC_USE_COMPLEX)
738:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

740:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
741:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
742:       VecScatterCreate(x,list1,y,list2,&vecscat);
743:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
744:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
745:       VecScatterDestroy(&vecscat);
746:       ISDestroy(&list1);
747:       ISDestroy(&list2);
748: #else
749:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

751:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);
752:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);

754:       if (dim[1]%2==0) {
755:         NM = dim[1]+2;
756:       } else {
757:         NM = dim[1]+1;
758:       }
759:       for (i=0; i<local_n0; i++) {
760:         for (j=0; j<dim[1]; j++) {
761:           tempindx  = i*dim[1] + j;
762:           tempindx1 = i*NM + j;

764:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
765:           indx2[tempindx]=low+tempindx1;
766:         }
767:       }

769:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
770:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

772:       VecScatterCreate(x,list1,y,list2,&vecscat);
773:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
774:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
775:       VecScatterDestroy(&vecscat);
776:       ISDestroy(&list1);
777:       ISDestroy(&list2);
778:       PetscFree(indx1);
779:       PetscFree(indx2);
780: #endif
781:       break;

783:     case 3:
784: #if defined(PETSC_USE_COMPLEX)
785:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

787:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
788:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
789:       VecScatterCreate(x,list1,y,list2,&vecscat);
790:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
791:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
792:       VecScatterDestroy(&vecscat);
793:       ISDestroy(&list1);
794:       ISDestroy(&list2);
795: #else
796:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
797:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
798:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

800:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
801:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

803:       if (dim[2]%2==0) NM = dim[2]+2;
804:       else             NM = dim[2]+1;

806:       for (i=0; i<local_n0; i++) {
807:         for (j=0; j<dim[1]; j++) {
808:           for (k=0; k<dim[2]; k++) {
809:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
810:             tempindx1 = i*dim[1]*NM + j*NM + k;

812:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
813:             indx2[tempindx]=low+tempindx1;
814:           }
815:         }
816:       }

818:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
819:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
820:       VecScatterCreate(x,list1,y,list2,&vecscat);
821:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
822:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
823:       VecScatterDestroy(&vecscat);
824:       ISDestroy(&list1);
825:       ISDestroy(&list2);
826:       PetscFree(indx1);
827:       PetscFree(indx2);
828: #endif
829:       break;

831:     default:
832: #if defined(PETSC_USE_COMPLEX)
833:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

835:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
836:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
837:       VecScatterCreate(x,list1,y,list2,&vecscat);
838:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
839:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
840:       VecScatterDestroy(&vecscat);
841:       ISDestroy(&list1);
842:       ISDestroy(&list2);
843: #else
844:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
845:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
846:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

848:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

850:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

852:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

854:       partial_dim = fftw->partial_dim;

856:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);
857:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);

859:       if (dim[ndim-1]%2==0) NM = 2;
860:       else                  NM = 1;

862:       j = low;
863:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
864:         indx1[i] = local_0_start*partial_dim + i;
865:         indx2[i] = j;
866:         if (k%dim[ndim-1]==0) j+=NM;
867:         j++;
868:       }
869:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
870:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
871:       VecScatterCreate(x,list1,y,list2,&vecscat);
872:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
873:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
874:       VecScatterDestroy(&vecscat);
875:       ISDestroy(&list1);
876:       ISDestroy(&list2);
877:       PetscFree(indx1);
878:       PetscFree(indx2);
879: #endif
880:       break;
881:     }
882:   }
883:   return(0);
884: }

888: /*@
889:    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

891:    Collective on Mat

893:     Input Parameters:
894: +   A - FFTW matrix
895: -   x - FFTW vector

897:    Output Parameter:
898: .  y - PETSc vector

900:    Level: intermediate

902:    Note: When performing a real transform, the FFTW output of the backward DFT contains extra padding zeros at the end of the last dimension.
903:          VecScatterFFTWToPetsc removes those extra zeros.

905: .seealso: VecScatterPetscToFFTW()
906: @*/
907: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
908: {

  /* Dispatch to the method composed on A under "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW for MATFFTW matrices; see MatCreate_FFTW).
     PetscTryMethod quietly does nothing if no such method is composed. */
912:   PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
913:   return(0);
914: }

/*
   VecScatterFFTWToPetsc_FFTW - MATFFTW implementation of VecScatterFFTWToPetsc():
   copies the FFTW-layout vector x into the natural PETSc ordering of y via a
   one-shot VecScatter.  For real scalars this also strips the padding FFTW
   appends to the last dimension of its r2c/c2r storage; for complex scalars
   the copy moves contiguous stride-1 ranges between the two distributions.

   Conventions used in the real (!PETSC_USE_COMPLEX) branches below:
     indx1/list1 - indices in the natural (unpadded, PETSc) ordering
     indx2/list2 - indices into the padded FFTW layout of x, offset by 'low',
                   the ownership start of x on this process
*/
918: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
919: {
921:   MPI_Comm       comm;
922:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
923:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
924:   PetscInt       N     = fft->N;
925:   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
926:   PetscInt       low;
927:   PetscMPIInt    rank,size;
928:   ptrdiff_t      local_n0,local_0_start;
929:   ptrdiff_t      local_n1,local_1_start;
930:   VecScatter     vecscat;
931:   IS             list1,list2;
932: #if !defined(PETSC_USE_COMPLEX)
933:   PetscInt       i,j,k,partial_dim;
934:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
935:   PetscInt       NM;
936:   ptrdiff_t      temp;
937: #endif

940:   PetscObjectGetComm((PetscObject)A,&comm);
941:   MPI_Comm_size(comm, &size);
942:   MPI_Comm_rank(comm, &rank);
  /* 'low' is the first global index of x owned by this process */
943:   VecGetOwnershipRange(x,&low,NULL);

945:   if (size==1) {
    /* Uniprocessor: no redistribution needed; copy all N entries with the
       same index set on both sides. */
946:     ISCreateStride(comm,N,0,1,&list1);
947:     VecScatterCreate(x,list1,y,list1,&vecscat);
948:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
949:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
950:     VecScatterDestroy(&vecscat);
951:     ISDestroy(&list1);

953:   } else {
954:     switch (ndim) {
955:     case 1:
956: #if defined(PETSC_USE_COMPLEX)
      /* Query the local portion FFTW assigns to this process for a 1-D
         backward transform; the output side is described by local_n1. */
957:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

959:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
960:       ISCreateStride(comm,local_n1,low,1,&list2);
961:       VecScatterCreate(x,list1,y,list2,&vecscat);
962:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
963:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
964:       VecScatterDestroy(&vecscat);
965:       ISDestroy(&list1);
966:       ISDestroy(&list2);
967: #else
968:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
969: #endif
970:       break;
971:     case 2:
972: #if defined(PETSC_USE_COMPLEX)
973:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

975:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
976:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
      /* NOTE(review): argument order (x,list2,y,list1) differs from the
         other complex branches, which pass (x,list1,y,list2); presumably
         harmless because both index sets cover matching contiguous ranges
         here — TODO confirm. */
977:       VecScatterCreate(x,list2,y,list1,&vecscat);
978:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
979:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
980:       VecScatterDestroy(&vecscat);
981:       ISDestroy(&list1);
982:       ISDestroy(&list2);
983: #else
984:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

986:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);
987:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);

      /* NM = 2*(dim[1]/2+1): padded row length (in PetscScalars) of the
         last dimension in FFTW's r2c storage. */
989:       if (dim[1]%2==0) NM = dim[1]+2;
990:       else             NM = dim[1]+1;

992:       for (i=0; i<local_n0; i++) {
993:         for (j=0; j<dim[1]; j++) {
          /* tempindx: offset in the natural (unpadded) row-major ordering;
             tempindx1: the same entry's offset in the padded FFTW rows. */
994:           tempindx = i*dim[1] + j;
995:           tempindx1 = i*NM + j;

997:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
998:           indx2[tempindx]=low+tempindx1;
999:         }
1000:       }

1002:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
1003:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

      /* Scatter from the padded FFTW entries x[list2] into the natural
         ordering y[list1], dropping the padding. */
1005:       VecScatterCreate(x,list2,y,list1,&vecscat);
1006:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1007:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1008:       VecScatterDestroy(&vecscat);
1009:       ISDestroy(&list1);
1010:       ISDestroy(&list2);
1011:       PetscFree(indx1);
1012:       PetscFree(indx2);
1013: #endif
1014:       break;
1015:     case 3:
1016: #if defined(PETSC_USE_COMPLEX)
1017:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1019:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1020:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1021:       VecScatterCreate(x,list1,y,list2,&vecscat);
1022:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1023:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1024:       VecScatterDestroy(&vecscat);
1025:       ISDestroy(&list1);
1026:       ISDestroy(&list2);
1027: #else
1028:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1030:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
1031:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

      /* NM = 2*(dim[2]/2+1): padded length of the innermost dimension in
         FFTW's r2c storage. */
1033:       if (dim[2]%2==0) NM = dim[2]+2;
1034:       else             NM = dim[2]+1;

1036:       for (i=0; i<local_n0; i++) {
1037:         for (j=0; j<dim[1]; j++) {
1038:           for (k=0; k<dim[2]; k++) {
            /* natural-order offset vs. offset in padded FFTW rows */
1039:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1040:             tempindx1 = i*dim[1]*NM + j*NM + k;

1042:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1043:             indx2[tempindx]=low+tempindx1;
1044:           }
1045:         }
1046:       }

1048:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
1049:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

1051:       VecScatterCreate(x,list2,y,list1,&vecscat);
1052:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1053:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1054:       VecScatterDestroy(&vecscat);
1055:       ISDestroy(&list1);
1056:       ISDestroy(&list2);
1057:       PetscFree(indx1);
1058:       PetscFree(indx2);
1059: #endif
1060:       break;
1061:     default:
1062: #if defined(PETSC_USE_COMPLEX)
1063:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

1065:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1066:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1067:       VecScatterCreate(x,list1,y,list2,&vecscat);
1068:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1069:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1070:       VecScatterDestroy(&vecscat);
1071:       ISDestroy(&list1);
1072:       ISDestroy(&list2);
1073: #else
      /* Temporarily substitute the r2c-padded size of the last dimension to
         query FFTW's local layout, then restore the true size. */
1074:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

1076:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

1078:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1080:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

      /* partial_dim = product of dim[1..ndim-1] (set up in MatCreate_FFTW) */
1082:       partial_dim = fftw->partial_dim;

1084:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);
1085:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);

      /* NM: number of padding scalars FFTW appends to each run of the last
         dimension (2 when dim is even, 1 when odd). */
1087:       if (dim[ndim-1]%2==0) NM = 2;
1088:       else                  NM = 1;

      /* Walk the natural ordering (i) and the padded FFTW ordering (j) in
         lockstep, skipping NM padding entries at the end of every run of
         dim[ndim-1] values. */
1090:       j = low;
1091:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1092:         indx1[i] = local_0_start*partial_dim + i;
1093:         indx2[i] = j;
1094:         if (k%dim[ndim-1]==0) j+=NM;
1095:         j++;
1096:       }
1097:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1098:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1100:       VecScatterCreate(x,list2,y,list1,&vecscat);
1101:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1102:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1103:       VecScatterDestroy(&vecscat);
1104:       ISDestroy(&list1);
1105:       ISDestroy(&list2);
1106:       PetscFree(indx1);
1107:       PetscFree(indx2);
1108: #endif
1109:       break;
1110:     }
1111:   }
1112:   return(0);
1113: }

1117: /*
1118:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1120:   Options Database Key:
1121: . -mat_fftw_plannerflags - set FFTW planner flags

1123:    Level: intermediate

1125: */
1126: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1127: {
1129:   MPI_Comm       comm;
1130:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1131:   Mat_FFTW       *fftw;
1132:   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
  /* option strings and their matching FFTW planner flags (same index order) */
1133:   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1134:   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1135:   PetscBool      flg;
1136:   PetscInt       p_flag,partial_dim=1,ctr;
1137:   PetscMPIInt    size,rank;
1138:   ptrdiff_t      *pdim;
1139:   ptrdiff_t      local_n1,local_1_start;
1140: #if !defined(PETSC_USE_COMPLEX)
1141:   ptrdiff_t      temp;
1142:   PetscInt       N1,tot_dim;
1143: #else
1144:   PetscInt       n1;
1145: #endif

1148:   PetscObjectGetComm((PetscObject)A,&comm);
1149:   MPI_Comm_size(comm, &size);
1150:   MPI_Comm_rank(comm, &rank);

1152:   fftw_mpi_init();
  /* ptrdiff_t copy of dim[] for the FFTW MPI layout queries below.
     NOTE(review): the calloc result is not checked and pdim is never freed
     before return — apparent leak; confirm whether intentional. */
1153:   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1154:   pdim[0] = dim[0];
1155: #if !defined(PETSC_USE_COMPLEX)
  /* tot_dim accumulates 2*dim[0]*...*(dim[ndim-1]/2+1): the number of real
     scalars in FFTW's padded r2c storage. */
1156:   tot_dim = 2*dim[0];
1157: #endif
1158:   for (ctr=1; ctr<ndim; ctr++) {
    /* partial_dim = product of all dimensions except the first */
1159:     partial_dim *= dim[ctr];
1160:     pdim[ctr]    = dim[ctr];
1161: #if !defined(PETSC_USE_COMPLEX)
1162:     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1163:     else             tot_dim *= dim[ctr];
1164: #endif
1165:   }

1166:   if (size == 1) {
1168: #if defined(PETSC_USE_COMPLEX)
1169:     MatSetSizes(A,N,N,N,N);
1170:     n    = N;
1171: #else
    /* real sequential case: sizes reflect the padded r2c storage */
1172:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1173:     n    = tot_dim;
1174: #endif

1176:   } else {
1177:     ptrdiff_t local_n0,local_0_start;
    /* Parallel case: ask FFTW how it distributes each problem shape and set
       the local/global Mat sizes accordingly. */
1178:     switch (ndim) {
1179:     case 1:
1180: #if !defined(PETSC_USE_COMPLEX)
1181:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1182: #else
1183:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

      /* 1-D complex: input distribution (n) and output distribution (n1)
         can differ, hence rectangular local sizes. */
1185:       n    = (PetscInt)local_n0;
1186:       n1   = (PetscInt)local_n1;
1187:       MatSetSizes(A,n1,n,N,N);
1188: #endif
1189:       break;
1190:     case 2:
1191: #if defined(PETSC_USE_COMPLEX)
1192:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1193:       /*
1194:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1195:        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1196:        */
1197:       n    = (PetscInt)local_n0*dim[1];
1198:       MatSetSizes(A,n,n,N,N);
1199: #else
1200:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      /* real case: count 2 real scalars per complex entry of the padded rows */
1202:       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1203:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1204: #endif
1205:       break;
1206:     case 3:
1207: #if defined(PETSC_USE_COMPLEX)
1208:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1210:       n    = (PetscInt)local_n0*dim[1]*dim[2];
1211:       MatSetSizes(A,n,n,N,N);
1212: #else
1213:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1215:       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1216:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1217: #endif
1218:       break;
1219:     default:
1220: #if defined(PETSC_USE_COMPLEX)
1221:       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

1223:       n    = (PetscInt)local_n0*partial_dim;
1224:       MatSetSizes(A,n,n,N,N);
1225: #else
      /* Temporarily replace the last dimension with its r2c-padded size for
         the layout query, then restore it. */
1226:       temp = pdim[ndim-1];

1228:       pdim[ndim-1] = temp/2 + 1;

1230:       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      /* local/global sizes rescaled by (padded last dim)/(true last dim) */
1232:       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1233:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);

1235:       pdim[ndim-1] = temp;

1237:       MatSetSizes(A,n,n,N1,N1);
1238: #endif
1239:       break;
1240:     }
1241:   }
1242:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
  /* allocate and attach the FFTW-specific data structure */
1243:   PetscNewLog(A,&fftw);
1244:   fft->data = (void*)fftw;

1246:   fft->n            = n;
1247:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1248:   fftw->partial_dim = partial_dim;

1250:   PetscMalloc1(ndim, &(fftw->dim_fftw));
1251:   if (size == 1) {
    /* iodims is only needed (and only allocated) on one process —
       presumably consumed when the sequential guru plans are built in
       MatMult_SeqFFTW; TODO confirm it is freed in MatDestroy_FFTW. */
1252: #if defined(PETSC_USE_64BIT_INDICES)
1253:     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1254: #else
1255:     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1256: #endif
1257:   }

1259:   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];

  /* plans are created lazily on first MatMult/MatMultTranspose */
1261:   fftw->p_forward  = 0;
1262:   fftw->p_backward = 0;
1263:   fftw->p_flag     = FFTW_ESTIMATE;

1265:   if (size == 1) {
1266:     A->ops->mult          = MatMult_SeqFFTW;
1267:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1268:   } else {
1269:     A->ops->mult          = MatMult_MPIFFTW;
1270:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1271:   }
1272:   fft->matdestroy = MatDestroy_FFTW;
1273:   A->assembled    = PETSC_TRUE;
1274:   A->preallocated = PETSC_TRUE;

  /* expose the FFTW-specific helper methods queried via PetscTryMethod */
1276:   PetscObjectComposeFunction((PetscObject)A,"MatGetVecsFFTW_C",MatGetVecsFFTW_FFTW);
1277:   PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
1278:   PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

1280:   /* get runtime options */
1281:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1282:   PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);
1283:   if (flg) {
    /* map the selected option string to the matching FFTW planner flag */
1284:     fftw->p_flag = iplans[p_flag];
1285:   }
1286:   PetscOptionsEnd();
1287:   return(0);
1288: }