#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
#include "jacobi.h"
#include "testname.h"

/*
 * do_print controls output useful for debugging and observing,
 * particularly the sloooowwww convergence of this method.
 */
static int do_print = 0; 
static int max_it   = 100;

/*
 * Define the symbol NO_CONV_TEST to mimic an explicit differencing scheme
 */

int main( argc, argv )
int argc;
char **argv;
{
    int        rank, size, i, j, itcnt;
    Mesh       mesh;
#ifndef NO_CONV_TEST
    double     diffnorm, gdiffnorm;
#else
#define gdiffnorm 1.0    
#endif
    double     *swap;
    double     *xlocalrow, *xnewrow;
#ifdef _AIX
#pragma disjoint (*xlocalrow, *xnewrow)
#endif
    int        maxm, maxn, lrow;
    int        k;
    double     t;

    MPI_Init( &argc, &argv );

    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    Get_command_line( rank, argc, argv, &maxm, &maxn, &do_print, &max_it );

    Setup_mesh( maxm, maxn, &mesh );

    /* Run twice and take the last time (ensures that code is paged in) */
    for (k=0; k<2; k++) {
	itcnt = 0;
	Init_mesh( &mesh );
	MPI_Barrier( mesh.ring_comm );
	t = MPI_Wtime();
	if (do_print && rank == 0) {
	    printf( "Starting at time %f\n", t ); fflush(stdout);
	}
	/* lrow is used enough that we make a local copy */
	lrow = mesh.lrow;
	do {
	    Exchange( &mesh );
	
	    /* 
	       Compute new values (but not on boundary).
	       Note that many compilers will not generate good code for this
	       (i.e., will not do the obvious strength reductions).
	       The code below gets reasonable performance on some systems;
	       additional rewriting might help others (e.g., 1+j instead of j+1).

	       Note that without some "disjoint" xnewrow, xlocalrow,
	       extra loads are needed in the loop.
	       */
	    itcnt ++;
#ifndef NO_CONV_TEST
	    diffnorm  = 0.0;
#endif
	    xnewrow	  = mesh.xnew   + 1 * maxm;
	    xlocalrow = mesh.xlocal + 1 * maxm;
	    for (i=1; i<=lrow; i++) {
		for (j=1; j<maxm-1; j++) {
		    double diff;
		    /* Multiply by .25 instead of divide by 4.0 */
		    xnewrow[j] = (xlocalrow[j+1] + 
				  xlocalrow[j-1] +
				  xlocalrow[maxm + j] + 
				  xlocalrow[-maxm + j]) * 0.25;
#ifndef NO_CONV_TEST
		    /* Accumulating diffnorm here can reduce cache misses
		       on large data.  
		       Local variable diff is used because some compilers can not 
		       do this simple common sub-expression analysis 
		       */
		    diff	  = xnewrow[j] - xlocalrow[j];
		    diffnorm += diff * diff;
#endif
		}
		xnewrow   += maxm;
		xlocalrow += maxm;
	    }

	    swap = mesh.xlocal; mesh.xlocal = mesh.xnew ; mesh.xnew = swap;

	    /* Convergence test */
#ifndef NO_CONV_TEST
	    MPI_Allreduce( &diffnorm, &gdiffnorm, 1, MPI_DOUBLE, MPI_SUM,
			   mesh.ring_comm );
	    gdiffnorm = sqrt( gdiffnorm );
	    if (do_print && rank == 0) {
		printf( "At iteration %d, diff is %e\n", itcnt, gdiffnorm );
		fflush( stdout );
	    }
#endif
	} while (gdiffnorm > 1.0e-2 && itcnt < max_it);
    
	t = MPI_Wtime() - t;
    }
    if (rank == 0) {
#ifdef NO_CONV_TEST
	printf( "%s: %d iterations in %f secs (%f MFlops), m=%d n=%d np=%d\n", 
		TESTNAME, itcnt, t,
		itcnt * (maxm-2.0)*(maxn-2)*(4)*1.0e-6/t, maxm, maxn, size );
#else
	/* 4 ops for new value, 3 for norm */
	printf( 
  "%s: %d iterations in %f secs (%f MFlops); diffnorm %f, m=%d n=%d np=%d\n", 
	        TESTNAME, itcnt, t,
		itcnt * (maxm-2.0)*(maxn-2)*(4+3)*1.0e-6/t, gdiffnorm, 
                maxm, maxn, size );
#endif
    }

    MPI_Comm_free( &mesh.ring_comm );
    MPI_Finalize( );
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
#include "jacobi.h"

/* 
 * Command line arguments are not defined by the MPI Standard.
 * We'll just get the values from process 0 and broadcast them
 *
 * maxm is the number of columns and maxn is the number of rows in the
 * global mesh.  Because the mesh is divided into row-oriented slabs, the
 * resulting pieces are roughly maxm x maxn/p.  See setupmesh.c for
 * the exact dimensions.
 * 
 */

void Get_command_line( rank, argc, argv, maxm, maxn, do_print, maxit )
int rank, argc, *maxm, *maxn, *do_print, *maxit;
char **argv;
{
    int args[4];
    int i;

    if (rank == 0) {
	/* Get maxn from the command line */
	*maxn = DEFAULT_MAXN;
	*maxm = -1;
	for (i=1; i<argc; i++) {
	    if (!argv[i]) continue;
	    if (strcmp( argv[i], "-print" ) == 0) *do_print = 1;
	    else if (strcmp( argv[i], "-n" ) == 0) {
		*maxn = atoi( argv[i+1] );
		i++;
	    }
	    else if (strcmp( argv[i], "-m" ) == 0) {
		*maxm = atoi( argv[i+1] );
		i++;
	    }
	    else if (strcmp( argv[i], "-maxit" ) == 0) {
		*maxit = atoi( argv[i+1] );
		i++;
	    }
	}
	if (*maxm < 0) *maxm = *maxn;
	args[0] = *maxn;
	args[1] = *maxm;
	args[2] = *do_print;
	args[3] = *maxit;
    }
    MPI_Bcast( args, 4, MPI_INT, 0, MPI_COMM_WORLD );
    *maxn     = args[0];
    *maxm     = args[1];
    *do_print = args[2];
    *maxit    = args[3];
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
#include "jacobi.h"

void Setup_mesh( maxm, maxn, mesh )
int  maxm, maxn;
Mesh *mesh;
{
    int false = 0;
    int true = 1;
    /* For some items, it is helpful to have local copies */
    int lrow, rank, size;
    register double *xlocal, *xnew;
    
    MPI_Comm_size( MPI_COMM_WORLD, &size );
    MPI_Cart_create( MPI_COMM_WORLD, 1, &size, &false, true, 
		     &mesh->ring_comm );
    MPI_Cart_shift( mesh->ring_comm, 0, 1, &mesh->down_nbr, &mesh->up_nbr );
    MPI_Comm_rank( mesh->ring_comm, &rank );
    MPI_Comm_size( mesh->ring_comm, &size );

    /* Number of local rows */
    lrow = (maxn - 2) / size;
    if (rank < ((maxn - 2) % size)) lrow++;

    mesh->lrow = lrow;
    mesh->maxm = maxm;
    mesh->maxn = maxn;

    /* Allocate the local meshes */
    mesh->xlocal = xlocal = 
	(double *)malloc( maxm * ( lrow + 2 ) * sizeof(double) );
    mesh->xnew   = xnew =
	(double *)malloc( maxm * ( lrow + 2 ) * sizeof(double) );
    if (!mesh->xlocal || !mesh->xnew) {
	fprintf( stderr, "Unable to allocate local mesh\n" );
	MPI_Abort( MPI_COMM_WORLD, 1 );
    }
}

void Init_mesh( mesh )
Mesh *mesh;
{
    int i, j, lrow, rank, maxm;
    register double *xlocal, *xnew;

    xlocal = mesh->xlocal;
    xnew   = mesh->xnew;
    lrow   = mesh->lrow;
    maxm   = mesh->maxm;

    MPI_Comm_rank( mesh->ring_comm, &rank );

    /* Fill the data as specified */
    for (i=1; i<=lrow; i++) 
	for (j=0; j<maxm; j++) {
	    xlocal[i*maxm+j] = rank;
	    xnew[i*maxm+j] = rank;
	}
    /* Boundary data or initial values in ghost cells */
    for (j=0; j<maxm; j++) {
	xlocal[j]		  = -1;
	xlocal[(lrow+1)*maxm + j] = rank + 1;
	xnew[j]			  = -1;
	xnew[(lrow+1)*maxm + j]	  = rank + 1;
    }
    /* Boundary data in xnew if we use the "swap" approach */
    for (i=1; i<=lrow; i++) {
	xnew[i*maxm]	    = rank;
	xnew[i*maxm+maxm-1] = rank;
    }
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
#include "jacobi.h"

void Exchange( mesh )
Mesh *mesh;
{
    MPI_Status statuses[4];
    double *xlocal     = mesh->xlocal;
    int     maxm       = mesh->maxm;
    int     lrow       = mesh->lrow;
    int     up_nbr     = mesh->up_nbr;
    int     down_nbr   = mesh->down_nbr;
    MPI_Comm ring_comm = mesh->ring_comm;
    MPI_Request r[4];

    /* Send up, then receive from below */
    MPI_Irecv( xlocal, maxm, MPI_DOUBLE, down_nbr, 0, ring_comm, &r[1] );
    MPI_Irecv( xlocal + maxm * (lrow + 1), maxm, MPI_DOUBLE, up_nbr, 1, 
	      ring_comm, &r[3] );
    MPI_Isend( xlocal + maxm * lrow, maxm, MPI_DOUBLE, up_nbr, 0,
	      ring_comm, &r[0] );
    /* Send down, then receive from above */
    MPI_Isend( xlocal + maxm, maxm, MPI_DOUBLE, down_nbr, 1, ring_comm, 
	       &r[2] );
    MPI_Waitall( 4, r, statuses );
}