// this is file testbutterfly.cc            -bds 10/2002
#include <iostream>
#include <vector>
#include <mpi.h>
#include "butterfly.h"

/*
butterfly_Allreduce comes from butterfly.h (porsche:~saunders/butterfly.h).
It only supports datatype double with the binary operation + (sum).
*/


// time_it() is a tool for timing the various all-reduction functions.
// The implementation is at the end of this file.
//
// It synchronizes the processes of `comm` with a barrier, calls
// reduce_func(data, result, size, type, binop, comm) `reps` times, and then,
// on the process whose rank in `comm` equals `root`, prints `reducer_name`,
// the last element of `result` (read as a double), `reps`, and the elapsed
// wall-clock time.
//
//   reduce_func  - all-reduce routine under test; called with the MPI
//                  datatype, op and communicator handles passed as ints.
//   reducer_name - label printed in front of the measurement.
//   reps         - number of back-to-back calls that are timed.
//   data, result - input and output buffers handed to reduce_func.
//   size         - element count handed to reduce_func.
//   type, binop  - MPI datatype and reduction operation handed to reduce_func.
//   root         - rank (within comm) that measures and prints.
//   comm         - communicator the reduction runs on.
void time_it(int (*reduce_func)(void*, void*, int, int, int, int), 
             char* reducer_name, int reps,
             void* data, void* result, int size, MPI_Datatype type,
	     MPI_Op binop, int root, MPI_Comm comm);

main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    ///////////////////////////////////////////////////////////////////
    // Create a communicator which excludes process 0 (i.e. porsche).
	//// get group
	MPI_Group world_group;
	MPI_Comm_group(MPI_COMM_WORLD, &world_group);
	//// set up vector of ranks for new comm
        int wp; // size of world comm.
	MPI_Comm_size(MPI_COMM_WORLD, &wp);
	vector<int> ranks(wp-1); 
	for (int i = 1; i < wp; ++i) ranks[i-1] = i;
	//// make new group
	MPI_Group rack_group;
	MPI_Group_incl(world_group, wp-1, &ranks[0], &rack_group);
	//// make new comm
	MPI_Comm rack; 
	MPI_Comm_create(MPI_COMM_WORLD, rack_group, &rack);

    ///////////////////////////////////////////////////////////////////
    // reduction timing tests
    int wr; // my rank in world comm.
    MPI_Comm_rank(MPI_COMM_WORLD, &wr);
    if (wr != 0) // leave porsche out of it altogether.
    {
	int r; MPI_Comm_rank(rack, &r);
	double data = r + 1, result, result2;

    /////////////////////////////////////////////
    // reductions where there is one data item //

        int reps = 1;
	// Case 1: Using MPI's built-in reduce function.
	time_it(MPI_Allreduce, "Case 1a: MPI_Allreduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack);
	time_it(MPI_Allreduce, "     1a: MPI_Allreduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack);

	// Case 3: Using a reduce built from a butterfly pattern of send-recv's.
	time_it(butterfly_Allreduce, "     2a: butterfly_Allreduce", reps, 
	        &data, &result2, 1, MPI_DOUBLE, MPI_SUM, 0, rack);
	time_it(butterfly_Allreduce, "     2a: butterfly_Allreduce", reps, 
	        &data, &result2, 1, MPI_DOUBLE, MPI_SUM, 0, rack);

    //////////////////////////////////////////////////////////
    // experiments on reductions using large arrays of data //
	int n = 100000;
	double D[n];
	for(int i = 0; i < n; ++i) D[i] = r+1;
	double R[n];
	double R2[n];

	time_it(MPI_Allreduce, "     1b: MPI_Allreduce", reps, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack);
	time_it(MPI_Allreduce, "     1b: MPI_Allreduce", reps, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(butterfly_Allreduce, "     2b: butterfly_Allreduce", reps, 
	        D, R2, n, MPI_DOUBLE, MPI_SUM, 0, rack);
	time_it(butterfly_Allreduce, "     2b: butterfly_Allreduce", reps, 
	        D, R2, n, MPI_DOUBLE, MPI_SUM, 0, rack);

    }
    MPI_Finalize();
	
}

// time_it() definition:
//
// Synchronizes the processes of `comm` with a barrier, calls
// reduce_func(data, result, size, type, binop, comm) `reps` times, and on the
// process whose rank in `comm` equals `root` prints the label, one value of
// the result, the repetition count and the elapsed wall-clock time.
//
// The MPI datatype, op and communicator handles are handed to reduce_func
// through int parameters, so this only compiles where those handle types
// convert implicitly to int.
//
// Known bugs:  for the printout part, time_it() assumes the result
// datatype is double and it shows just the last value in the result array.
// When size <= 0 there is no last value, so "(no data)" is printed instead.
void time_it(int (*reduce_func)(void*, void*, int, int, int, int),
             char* reducer_name, int reps,
             void* data, void* result, int size, MPI_Datatype type,
             MPI_Op binop, int root, MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);
    const bool is_root = (rank == root);

    // Line all processes up before the root starts its clock.
    MPI_Barrier(comm);
    double start_time = 0.0; // only meaningful on the root
    if (is_root) start_time = MPI_Wtime();

    for (int i = 0; i < reps; ++i)
        reduce_func(data, result, size, type, binop, comm);

    if (is_root)
    {
        const double elapsed_time = MPI_Wtime() - start_time;
        std::cout << reducer_name << " computed ";
        if (size > 0)
            std::cout << static_cast<double*>(result)[size - 1];
        else
            std::cout << "(no data)"; // avoid reading result[-1]
        std::cout << " repeated " << reps << " times";
        std::cout << " in " << elapsed_time << " seconds." << std::endl;
    }
} // time_it()
