CSE 260 project: sharing the whole local particle set with neighboring ranks.
SGE job submission script:
#!/bin/bash
# This script is interpreted by the Bourne shell, sh
#
# Documentation for SGE is found in:
# http://docs.oracle.com/cd/E19279-01/820-3257-12/n1ge.html
#
# Tell SGE which shell to run the job script in rather than depending
# on SGE to try and figure it out.
#$ -S /bin/bash
#
# Export all my environment variables to the job
#$ -V
#
# Run the job in the same directory from which you submitted it
#$ -cwd
#
#
# --- Don't change anything above this line ---
#
# Give a name to the job
#$ -N MPI
#
# Specify a time limit for the job, not more than 30 minutes
#$ -l h_rt=00:10:00
#
# Specify the parallel environment and number of cores.
# If not a multiple of 8, you'll get the whole node anyway
#$ -pe orte 8
#
# Join stdout and stderr so they are reported in the job output file
#$ -j y
#
#
# Choose the queue to run the job
#
# Debug queue: only one node may be used at a time for up to 30 minutes
# Interactive or batch jobs, maximum of 1 job per user running at a time
#
# Normal queue: job may use all available compute nodes (256 cores)
# for up to 60 minutes
# Batch jobs, maximum of 2 jobs per user running at a time
# To use more than one node, specify the "normal" queue
#$ -q normal.q
# #$ -q debug.q
#
# Specifies the circumstances under which mail is to be sent to the job owner
# defined by the -M option. For example, the options "bea" cause mail to be sent
# at the beginning, end, and abort time (if it happens) of the job.
# Option "n" means no mail will be sent.
#$ -m aeb
#
# *** Change to the address you want the notification sent to, and
# *** REMOVE the blank between the # and the $
#$ -M jesper.jln@gmail.com
#
echo
echo " *** Current working directory"
pwd
echo
echo " *** Compiler"
# Report which compiler we are using, and the environment
mpicc -v
echo
echo " *** Environment"
printenv
echo
echo ">>> Job Starts"
date
mpirun -np $NSLOTS ./mpi -n 5000 -proc_width 4 -proc_height 2 -o mpi.txt -c checksum.txt
date
echo ">>> Job Ends"
common.cpp (shared simulation routines):
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <float.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#include <sys/time.h>
#include "common.h"

double size;

//
//  tuned constants
//
#define density 0.0005
#define mass    0.01
#define cutoff  0.01
#define min_r   (cutoff/100)
#define dt      0.0005

//
//  timer
//
double read_timer( )
{
    static bool initialized = false;
    static struct timeval start;
    struct timeval end;
    if( !initialized )
    {
        gettimeofday( &start, NULL );
        initialized = true;
    }
    gettimeofday( &end, NULL );
    return (end.tv_sec - start.tv_sec) + 1.0e-6 * (end.tv_usec - start.tv_usec);
}

//
//  keep density constant
//
double set_size( int n )
{
    size = sqrt( density * n );
    return size;
}

//
//  Initialize the particle positions and velocities
//
void init_particles( int n, particle_t *p )
{
    srand48( 1 );

    int sx = (int)ceil(sqrt((double)n));
    int sy = (n+sx-1)/sx;

    int *shuffle = (int*)malloc( n * sizeof(int) );
    for( int i = 0; i < n; i++ )
        shuffle[i] = i;

    for( int i = 0; i < n; i++ )
    {
        //
        //  make sure particles are not spatially sorted
        //
        int j = lrand48()%(n-i);
        int k = shuffle[j];
        shuffle[j] = shuffle[n-i-1];

        //
        //  distribute particles evenly to ensure proper spacing
        //
        p[i].x = size*(1.+(k%sx))/(1+sx);
        p[i].y = size*(1.+(k/sx))/(1+sy);
        p[i].index = i;

        //
        //  assign random velocities within a bound
        //
        p[i].vx = drand48()*2-1;
        p[i].vy = drand48()*2-1;
    }
    free( shuffle );
}

//
//  interact two particles
//
void apply_force( particle_t &particle, particle_t &neighbor )
{
    double dx = neighbor.x - particle.x;
    double dy = neighbor.y - particle.y;
    double r2 = dx * dx + dy * dy;
    if( r2 > cutoff*cutoff )
        return;
    r2 = fmax( r2, min_r*min_r );
    double r = sqrt( r2 );

    //
    //  very simple short-range repulsive force
    //
    double coef = ( 1 - cutoff / r ) / r2 / mass;
    particle.ax += coef * dx;
    particle.ay += coef * dy;
}

//
//  integrate the ODE
//
void move( particle_t &p )
{
    //
    //  slightly simplified Velocity Verlet integration
    //  conserves energy better than explicit Euler method
    //
    p.vx += p.ax * dt;
    p.vy += p.ay * dt;
    p.x  += p.vx * dt;
    p.y  += p.vy * dt;

    //
    //  bounce from walls
    //
    while( p.x < 0 || p.x > size )
    {
        p.x  = p.x < 0 ? -p.x : 2*size-p.x;
        p.vx = -p.vx;
    }
    while( p.y < 0 || p.y > size )
    {
        p.y  = p.y < 0 ? -p.y : 2*size-p.y;
        p.vy = -p.vy;
    }
}

//
//  I/O routines
//
void save( FILE *f, int n, particle_t *p )
{
    static bool first = true;
    if( first )
    {
        fprintf( f, "%d %g\n", n, size );
        first = false;
    }
    for( int i = 0; i < n; i++ )
        fprintf( f, "%g %g\n", p[i].x, p[i].y );
}

//
//  command line option processing
//
int find_option( int argc, char **argv, const char *option )
{
    for( int i = 1; i < argc; i++ )
        if( strcmp( argv[i], option ) == 0 )
            return i;
    return -1;
}

int read_int( int argc, char **argv, const char *option, int default_value )
{
    int iplace = find_option( argc, argv, option );
    if( iplace >= 0 && iplace < argc-1 )
        return atoi( argv[iplace+1] );
    return default_value;
}

char *read_string( int argc, char **argv, const char *option, char *default_value )
{
    int iplace = find_option( argc, argv, option );
    if( iplace >= 0 && iplace < argc-1 )
        return argv[iplace+1];
    return default_value;
}
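As an illustration of how these routines fit together (not part of the assignment code), a minimal serial driver over the common.h API would look roughly like this: pick the domain size, initialize the particles, then alternate the O(n^2) force evaluation with the integration step.

#include <stdlib.h>
#include "common.h"

// Hypothetical serial driver: quadratic force evaluation, no MPI, no file output.
int serial_demo( int n )
{
    particle_t *p = (particle_t*) malloc( n * sizeof(particle_t) );
    set_size( n );                          // choose the domain size for constant density
    init_particles( n, p );                 // random, well-spaced initial positions and velocities
    for( int step = 0; step < NSTEPS; step++ )
    {
        for( int i = 0; i < n; i++ )
        {
            p[i].ax = p[i].ay = 0;
            for( int j = 0; j < n; j++ )
                apply_force( p[i], p[j] );  // short-range repulsion, a no-op beyond the cutoff
        }
        for( int i = 0; i < n; i++ )
            move( p[i] );                   // Velocity Verlet step plus wall bounce
    }
    free( p );
    return 0;
}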
common.h (shared declarations):
#ifndef __CS267_COMMON_H__
#define __CS267_COMMON_H__

#include <mpi.h>   // neighbor_t below uses MPI_Request / MPI_Status

inline int min( int a, int b ) { return a < b ? a : b; }
inline int max( int a, int b ) { return a > b ? a : b; }

//
//  saving parameters
//
const int NSTEPS = 1000;
const int SAVEFREQ = 10;

//
//  particle data structure
//
//  index is stored as a double so that particle_t is seven contiguous doubles,
//  matching the PARTICLE datatype built with MPI_Type_contiguous( 7, MPI_DOUBLE, ... ) in mpi.cpp
//
typedef struct
{
    double index;
    double x;
    double y;
    double vx;
    double vy;
    double ax;
    double ay;
} particle_t;

//
//  one entry per surrounding rank (plus self): grid position, global rank,
//  the rectangular sub-domain it owns, and the handles for the asynchronous
//  size/particle exchange in mpi.cpp
//
typedef struct
{
    int offset;
    int size;
    int proc_x;
    int proc_y;
    int rank;
    double from_x;
    double from_y;
    double to_x;
    double to_y;
    MPI_Request recv_req;
    MPI_Request send_req;
    MPI_Status recv_stat;
    MPI_Status send_stat;
} neighbor_t;

//
//  timing routines
//
double read_timer( );

//
//  simulation routines
//
double set_size( int n );
void init_particles( int n, particle_t *p );
void apply_force( particle_t &particle, particle_t &neighbor );
void move( particle_t &p );

//
//  I/O routines
//
FILE *open_save( char *filename, int n );
void save( FILE *f, int n, particle_t *p );

//
//  argument processing routines
//
int find_option( int argc, char **argv, const char *option );
int read_int( int argc, char **argv, const char *option, int default_value );
char *read_string( int argc, char **argv, const char *option, char *default_value );

#endif
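The neighbor_t fields encode the convention mpi.cpp relies on: ranks are laid out row-major on a proc_width x proc_height grid, and each rank owns one axis-aligned rectangle of the square domain. A small sketch of that mapping, with hypothetical local names, just to make the convention explicit:

// Illustrative sketch only: fills in the geometric fields of a neighbor_t for a
// given rank, matching the row-major layout and sub-domain bounds used in mpi.cpp.
void rank_to_subdomain( int rank, int proc_width, int proc_height, double size, neighbor_t *nb )
{
    nb->rank   = rank;
    nb->proc_x = rank % proc_width;        // column in the processor grid
    nb->proc_y = rank / proc_width;        // row in the processor grid
    double size_x = size / proc_width;     // width of one sub-domain
    double size_y = size / proc_height;    // height of one sub-domain
    nb->from_x = size_x * nb->proc_x;  nb->to_x = nb->from_x + size_x;
    nb->from_y = size_y * nb->proc_y;  nb->to_y = nb->from_y + size_y;
}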
Makefile:
HOST = $(shell hostname)
BANG = $(shell expr match `hostname` ccom-bang)
BANG-COMPUTE = $(shell expr match `hostname` compute)
LILLIPUT = $(shell expr match `hostname` lilliput)

ifneq ($(BANG), 0)
PUB = /share/class/public/cse260-fa12
include $(PUB)/Arch/arch.gnu.generic
else
ifneq ($(BANG-COMPUTE), 0)
PUB = /share/class/public/cse260-fa12
include $(PUB)/Arch/arch.gnu.generic
else
ifneq ($(LILLIPUT), 0)
PUB = /class/public/cse260-fa12
include $(PUB)/Arch/arch.intel.generic
else
# PUB = /Users/baden/lib
include $(PUB)/Arch/arch.gnu
# include $(PUB)/Arch/arch.gnu-4.5
endif
endif
endif

#
# Add symbol table information for gdb/cachegrind
ifeq ($(debug), 1)
    CFLAGS += -g
    LDFLAGS += -g
    C++FLAGS += -g
endif

# Add symbol table information for gprof
ifeq ($(gprof), 1)
    CFLAGS += -g -pg
    C++FLAGS += -g -pg
    LDFLAGS += -g -pg
endif

# If you want to compile for single precision,
# specify single=1 on the "make" command line
ifeq ($(single), 1)
else
    C++FLAGS += -D_DOUBLE
    CFLAGS += -D_DOUBLE
endif

# If you want to compile so that you call the plotter for
# flattened 2D arrays (implemented as 1D arrays)
# specify flattened=1 on the "make" command line
ifeq ($(flattened), 1)
    C++FLAGS += -DPLOT1D
    CFLAGS += -DPLOT1D
endif

# If you want to use restrict pointers, make restrict=1
# This applies to the hand coded version
ifeq ($(restrict), 1)
    C++FLAGS += -D__RESTRICT
    CFLAGS += -D__RESTRICT
ifneq ($(CARVER), 0)
    C++FLAGS += -restrict
    CFLAGS += -restrict
endif
endif

#DEBUG += -DDEBUG

TARGETS = mpi
OBJECTS = common.o mpi.o

#ifeq ($(no-mpi),1)
#OBJECTS += Timer.o
#endif

app: $(TARGETS)

mpi: mpi.o common.o
	$(C++LINK) $(LDFLAGS) -o $@ mpi.o common.o $(LDLIBS)

clean:
	$(RM) *.o $(TARGETS);
	$(RM) core.*;
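Typical build commands on one of the recognized course machines (the optional knobs correspond to the conditionals above):

make mpi            # build the MPI binary from mpi.o and common.o
make mpi debug=1    # add -g for gdb/cachegrind
make mpi gprof=1    # add -g -pg for gprof
make clean          # remove objects, the mpi binary and core files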
mpi.cpp (MPI particle simulation driver):
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "common.h"

//
//  indices into the neighbors array: self plus the 8 surrounding ranks
//
const int SELF            = 0,
          BOTTOM_STRAIGHT = 1,
          BOTTOM_LEFT     = 2,
          STRAIGHT_LEFT   = 3,
          TOP_LEFT        = 4,
          TOP_STRAIGHT    = 5,
          TOP_RIGHT       = 6,
          STRAIGHT_RIGHT  = 7,
          BOTTOM_RIGHT    = 8;

//
//  benchmarking program
//
int main( int argc, char **argv )
{
    //
    //  process command line parameters
    //
    if( find_option( argc, argv, "-h" ) >= 0 )
    {
        printf( "Options:\n" );
        printf( "-h to see this help\n" );
        printf( "-n <int> to set the number of particles\n" );
        printf( "-proc_width <int> to set the width of the processor grid\n" );
        printf( "-proc_height <int> to set the height of the processor grid\n" );
        printf( "-o <filename> to specify the output file name\n" );
        printf( "-c <checksum> to specify the output checksum file name\n" );
        return 0;
    }

    int n = read_int( argc, argv, "-n", 1000 );
    int proc_width = read_int( argc, argv, "-proc_width", 0 );   // required
    int proc_height = read_int( argc, argv, "-proc_height", 0 ); // required
    char *savename = read_string( argc, argv, "-o", NULL );
    char *checksum = read_string( argc, argv, "-c", NULL );

    //
    //  set up MPI
    //
    int n_proc, myrank;
    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &n_proc );
    MPI_Comm_rank( MPI_COMM_WORLD, &myrank );

    // Ensure the requested processor grid matches the number of MPI ranks
    if( (proc_height * proc_width) != n_proc )
    {
        if( !myrank )
            printf( "\n *** The number of processors in the geometry (%d) is not the same as the number requested (%d)\n", proc_height*proc_width, n_proc );
        MPI_Abort( MPI_COMM_WORLD, -1 );
    }

    int my_x = myrank % proc_width;
    int my_y = myrank / proc_width;

    //
    //  set up the neighbor topology: grid coordinates of self and the 8 surrounding ranks
    //
    neighbor_t *neighbors = (neighbor_t*) malloc( 9 * sizeof(neighbor_t) );
    neighbors[ SELF            ].proc_x = my_x;     neighbors[ SELF            ].proc_y = my_y;
    neighbors[ BOTTOM_STRAIGHT ].proc_x = my_x;     neighbors[ BOTTOM_STRAIGHT ].proc_y = my_y + 1;
    neighbors[ BOTTOM_LEFT     ].proc_x = my_x - 1; neighbors[ BOTTOM_LEFT     ].proc_y = my_y + 1;
    neighbors[ STRAIGHT_LEFT   ].proc_x = my_x - 1; neighbors[ STRAIGHT_LEFT   ].proc_y = my_y;
    neighbors[ TOP_LEFT        ].proc_x = my_x - 1; neighbors[ TOP_LEFT        ].proc_y = my_y - 1;
    neighbors[ TOP_STRAIGHT    ].proc_x = my_x;     neighbors[ TOP_STRAIGHT    ].proc_y = my_y - 1;
    neighbors[ TOP_RIGHT       ].proc_x = my_x + 1; neighbors[ TOP_RIGHT       ].proc_y = my_y - 1;
    neighbors[ STRAIGHT_RIGHT  ].proc_x = my_x + 1; neighbors[ STRAIGHT_RIGHT  ].proc_y = my_y;
    neighbors[ BOTTOM_RIGHT    ].proc_x = my_x + 1; neighbors[ BOTTOM_RIGHT    ].proc_y = my_y + 1;

    //
    //  allocate generic resources
    //
    FILE *fsave = savename && myrank == 0 ? fopen( savename, "w" ) : NULL;
    FILE *fchecksum = checksum && myrank == 0 ? fopen( checksum, "w" ) : NULL;

    particle_t *particles = (particle_t*) malloc( n * sizeof(particle_t) );
    particle_t *particles_prev = (particle_t*) malloc( n * sizeof(particle_t) );

    MPI_Datatype PARTICLE;
    MPI_Type_contiguous( 7, MPI_DOUBLE, &PARTICLE );
    MPI_Type_commit( &PARTICLE );

    //
    //  initialize and distribute the particles (it's fine to leave this unoptimized)
    //
    double size = set_size( n );
    double size_x = size / proc_width;
    double size_y = size / proc_height;

    //
    //  set up the data partitioning across processors
    //
    if( !myrank ) // initialize all particles on rank 0
        init_particles( n, particles_prev );
    MPI_Bcast( particles_prev, n, PARTICLE, 0, MPI_COMM_WORLD );

    for( int i = 0; i < 9; i++ )
    {
        neighbors[ i ].from_x = size_x * neighbors[ i ].proc_x;
        neighbors[ i ].to_x   = size_x + neighbors[ i ].from_x;
        neighbors[ i ].from_y = size_y * neighbors[ i ].proc_y;
        neighbors[ i ].to_y   = size_y + neighbors[ i ].from_y;
        if( neighbors[ i ].proc_x > -1 && neighbors[ i ].proc_x < proc_width && neighbors[ i ].proc_y > -1 && neighbors[ i ].proc_y < proc_height )
        {
            neighbors[ i ].rank = neighbors[ i ].proc_x + neighbors[ i ].proc_y * proc_width;
            neighbors[ i ].offset = (i == 0 ? 0 : neighbors[ i - 1 ].size + neighbors[ i - 1 ].offset);
            neighbors[ i ].size = 0;
            // bucket the broadcast particles that fall inside this neighbor's sub-domain
            for( int j = 0; j < n; j++ )
            {
                if( particles_prev[j].x >= neighbors[i].from_x && particles_prev[j].x < neighbors[i].to_x &&
                    particles_prev[j].y >= neighbors[i].from_y && particles_prev[j].y < neighbors[i].to_y )
                {
                    particles[ neighbors[ i ].size + neighbors[ i ].offset ] = particles_prev[j];
                    neighbors[ i ].size += 1;
                }
            }
        }
        else
        {
            // this neighbor lies outside the processor grid
            neighbors[ i ].rank = -1;
            neighbors[ i ].offset = neighbors[ i - 1 ].size + neighbors[ i - 1 ].offset;
            neighbors[ i ].size = 0;
        }
    }
    particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;

    // Rank 0 gathers the per-rank particle counts and checks that no particle was lost
    int *n_proc_sizes = new int[n_proc];
    int *n_proc_offset = new int[n_proc];
    int count = 0;
    MPI_Gather( &neighbors[SELF].size, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
    if( !myrank )
    {
        int real_n = 0;
        for( int i = 0; i < n_proc; i++ )
            real_n += n_proc_sizes[i];
        printf( "Before correctness check: n: %d, real-n: %d\n", n, real_n );
    }

    //
    //  simulate a number of time steps
    //
    double simulation_time = read_timer( );
    for( int step = 0; step < NSTEPS; step++ )
    {
        //
        //  save current step if necessary (slightly different semantics than in other codes)
        //
        if( (step%SAVEFREQ) == 0 && savename )
        {
            // collect the particle counts from every rank
            MPI_Gather( &neighbors[SELF].size, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
            // calculate the displacement of each rank's block in the gathered array
            if( myrank == 0 )
            {
                for( int i = 0; i < n_proc; i++ )
                {
                    if( i == 0 )
                        n_proc_offset[i] = 0;
                    else
                        n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
                }
            }
            MPI_Gatherv( particles_prev, neighbors[SELF].size, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
            if( myrank == 0 && fsave )
                save( fsave, n, particles );
        }

        // total number of particles to consider: local particles plus ghosts from all neighbors
        int look_nodes = neighbors[ 8 ].offset + neighbors[ 8 ].size;

        // post receives for the new particle counts of the neighbors
        for( int i = 1; i < 9; i++ )
            if( neighbors[i].rank != -1 )
                MPI_Irecv( &neighbors[i].size, 1, MPI_INT, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].recv_req );

        //
        //  compute forces between the local particles and the neighbors' ghost particles
        //
        for( int i = 0; i < look_nodes; i++ )
        {
            particles_prev[i].ax = particles_prev[i].ay = 0;
            for( int j = 0; j < look_nodes; j++ )
                apply_force( particles_prev[i], particles_prev[j] );
        }

        //
        //  move particles
        //
        count = 0;
        for( int i = 0; i < look_nodes; i++ )
        {
            move( particles_prev[i] );
            // keep the particles that are still inside this rank's sub-domain
            if( particles_prev[i].x >= neighbors[SELF].from_x && particles_prev[i].x < neighbors[SELF].to_x &&
                particles_prev[i].y >= neighbors[SELF].from_y && particles_prev[i].y < neighbors[SELF].to_y )
            {
                particles[count] = particles_prev[i];
                count++;
            }
        }
        neighbors[SELF].size = count;

        // send the new local particle count to the neighbors
        for( int i = 1; i < 9; i++ )
            if( neighbors[i].rank != -1 )
                MPI_Isend( &neighbors[SELF].size, 1, MPI_INT, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].send_req );

        // once a neighbor's count arrives, compute its offset and post the receive for its particles
        for( int i = 1; i < 9; i++ )
        {
            if( neighbors[i].rank != -1 )
                MPI_Wait( &neighbors[i].recv_req, &neighbors[i].recv_stat );
            neighbors[ i ].offset = neighbors[ i - 1 ].size + neighbors[ i - 1 ].offset;
            if( neighbors[i].rank != -1 )
                MPI_Irecv( &particles[neighbors[i].offset], neighbors[i].size, PARTICLE, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].recv_req );
        }

        // send the whole local particle set to every neighbor
        for( int i = 1; i < 9; i++ )
        {
            if( neighbors[i].rank != -1 )
            {
                MPI_Wait( &neighbors[i].send_req, &neighbors[i].send_stat );
                MPI_Isend( &particles[0], neighbors[SELF].size, PARTICLE, neighbors[i].rank, 0, MPI_COMM_WORLD, &neighbors[i].send_req );
            }
        }

        particle_t *tmp = particles; particles = particles_prev; particles_prev = tmp;

        // wait for all outstanding particle sends and receives before the next step
        for( int i = 1; i < 9; i++ )
        {
            if( neighbors[i].rank != -1 )
            {
                MPI_Wait( &neighbors[i].recv_req, &neighbors[i].recv_stat );
                MPI_Wait( &neighbors[i].send_req, &neighbors[i].send_stat );
            }
        }
    }
    simulation_time = read_timer( ) - simulation_time;

    MPI_Gather( &neighbors[SELF].size, 1, MPI_INT, n_proc_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD );
    if( !myrank )
    {
        int real_n = 0;
        for( int i = 0; i < n_proc; i++ )
        {
            if( i == 0 )
                n_proc_offset[i] = 0;
            else
                n_proc_offset[i] = n_proc_offset[i-1] + n_proc_sizes[i-1];
            real_n += n_proc_sizes[i];
        }
        printf( "After correctness check: n: %d, real-n: %d\n", n, real_n );
        printf( "n = %d, n_procs = %d, proc_width = %d, proc_height = %d, simulation time = %g s\n", n, n_proc, proc_width, proc_height, simulation_time );
    }

    MPI_Gatherv( particles_prev, neighbors[SELF].size, PARTICLE, particles, n_proc_sizes, n_proc_offset, PARTICLE, 0, MPI_COMM_WORLD );
    // reorder the gathered particles by their original index before writing the checksum
    if( !myrank && fchecksum )
    {
        for( int j = 0; j < n; j++ )
            particles_prev[(int)particles[j].index] = particles[j];
        save( fchecksum, n, particles_prev );
    }

    //
    //  release resources
    //
    free( neighbors );
    free( particles );
    free( particles_prev );
    delete [] n_proc_sizes;
    delete [] n_proc_offset;
    if( fsave )
        fclose( fsave );
    if( fchecksum )
        fclose( fchecksum );
    MPI_Finalize( );

    return 0;
}
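For a quick run outside the batch system, the only constraint the program enforces is that proc_width * proc_height equals the number of MPI ranks; for example (particle counts and file names are arbitrary):

# 8 ranks arranged as a 4x2 processor grid, as in the job script above
mpirun -np 8 ./mpi -n 5000 -proc_width 4 -proc_height 2 -o mpi.txt -c checksum.txt
# 4 ranks as a 2x2 grid; -np must equal proc_width * proc_height
mpirun -np 4 ./mpi -n 2000 -proc_width 2 -proc_height 2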