/* Timing test program for several reduction strategies.
Note this program is designed to be used with 17 processes
of which 16 participate in the reductions.  At a minimum
there must be 10 processes.
*/
#include <iostream>
#include <vector>
#include <mpi.h>
#include "tree-n-star-reduce.h"

/*******************************************************
The goal of this homework is to implement, test, and 
analyze alternate ways to do reduction operations.  The coding 
is to be done in tree-n-star-reduce.h, which can then be run
with this test program.  This program is written to obtain
three timings in each of the following 6 cases:
case 1a: One value per process being reduced using MPI_Reduce 
case 2a: One value per process being reduced using your star_Reduce 
case 3a: One value per process being reduced using your tree_Reduce 
case 1b: An array of many values per process reduced by MPI_Reduce 
case 2b: An array of many values per process reduced by your star_Reduce 
case 3b: An array of many values per process reduced by your tree_Reduce 

There are two problems in this homework:

********
1. Problem 1 has three parts:  
Parts 1 and 2 are to improve the implementations
of star_Reduce and tree_Reduce as specified in tree-n-star-reduce.h.
Part 3 is to change this test routine so that it tests the reducers
on int data instead of double data and also uses multiplication and max
as the binop as well as addition in various test cases.

Submit your code files, a script of a run with this program, and
a one page discussion/explanation of the timings you got.

********
2. Problem 2 is to implement star_Allreduce and tree_Allreduce.
star_Allreduce does an Allreduce by a star_Reduce followed by 
a Bcast.  tree_Allreduce does a tree_Reduce followed by
a "tree_Bcast" (or you could call it "tree_Expand" or "reverse tree_Reduce"),
i.e. a form of bcast wherein at each step the number of processes 
having the result data doubles.

Submit (1) your code files, (2) a script of a run that times your two
Allreduces against MPI_Allreduce and "butterfly_Allreduce",
and (3) a one page discussion/explanation of the timings you got.

The butterfly_Allreduce is found in porsche:~saunders/butterfly.h.
It only works for root = 0, p a power of 2, datatype double, binop +.

********
The due date is Oct 22.

*******************************************************
*******************************************************

Notes:  
1. Sometimes it appears that timings depend on the order
in which timed activities are done.  If you wish, you may rearrange 
the order of the 18 timed activities in this program to see what 
effect that may have on the results.

2. Compare the effects of the order in which the 16 cpu's are 
used.  The makefile might be:
reduction: reduction.C tree-n-star-reduce.h
        mpiCC reduction.C -o reduction

# 8 cars used cyclically (processes 0 and 8 on the 2 honda cpu's, ...)
run1:
        mpirun -np 17 -machinefile cars reduction

# 8 cars used doubly (processes 0 and 1 on the 2 honda cpu's, ...)
run2:
        mpirun -np 17 -machinefile car2 reduction

*******************************************************/

// time_it() is a tool for timing the various reduction functions.
// The implementation is at the end of this file.
//
// It synchronizes the processes of comm with a barrier, then calls
//     reduce_func(data, result, size, type, binop, root, comm)
// reps times in a row.  The process whose rank in comm equals root
// measures the elapsed time with MPI_Wtime() and prints reducer_name,
// the last element of result, reps, and the elapsed seconds.
// See the notes at the definition for the limitations of that printout.
//
// Parameters:
//   reduce_func   reduction routine to time (e.g. MPI_Reduce, star_Reduce)
//   reducer_name  label printed in front of the timing line
//   reps          number of consecutive calls to time
//   data, result  input and output buffers handed to reduce_func
//   size          number of elements in each buffer
//   type, binop   MPI datatype and reduction operation handed to reduce_func
//   root          rank (in comm) that receives the result and prints
//   comm          communicator over which the reduction runs
//
// NOTE(review): the function-pointer type declares the datatype, op and
// communicator arguments as int; presumably this matches an MPI
// implementation whose handles are ints -- confirm before porting.
void time_it(int (*reduce_func)(void*, void*, int, int, int, int, int), 
             char* reducer_name, int reps,
             void* data, void* result, int size, MPI_Datatype type,
	     MPI_Op binop, int root, MPI_Comm comm);

main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    ///////////////////////////////////////////////////////////////////
    // Create a communicator which excludes process 0 (i.e. porsche).
	//// get group
	MPI_Group world_group;
	MPI_Comm_group(MPI_COMM_WORLD, &world_group);
	//// set up vector of ranks for new comm
        int wp; // size of world comm.
	MPI_Comm_size(MPI_COMM_WORLD, &wp);
	vector<int> ranks(wp-1); 
	for (int i = 1; i < wp; ++i) ranks[i-1] = i;
	//// make new group
	MPI_Group rack_group;
	MPI_Group_incl(world_group, wp-1, &ranks[0], &rack_group);
	//// make new comm
	MPI_Comm rack; 
	MPI_Comm_create(MPI_COMM_WORLD, rack_group, &rack);

    ///////////////////////////////////////////////////////////////////
    // reduction timing tests
    int wr; // my rank in world comm.
    MPI_Comm_rank(MPI_COMM_WORLD, &wr);
    if (wr != 0) // leave porsche out of it altogether.
    {
	int r; MPI_Comm_rank(rack, &r);
	double data = r + 1, result;

    /////////////////////////////////////////////
    // reductions where there is one data item //

        int reps = 10;
	// Case 1: Using MPI's built-in reduce function.
	time_it(MPI_Reduce, "Case 1a: MPI_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(MPI_Reduce, "Case 1a: MPI_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 1, rack);

	time_it(MPI_Reduce, "Case 1a: MPI_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 8, rack);

	// Case 2: Using a reduce built from a star pattern of send-recv's.
	time_it(star_Reduce, "Case 2a: star_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(star_Reduce, "Case 2a: star_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 1, rack);

	time_it(star_Reduce, "Case 2a: star_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 8, rack);

	// Case 3: Using a reduce built from a tree pattern of send-recv's.
	time_it(tree_Reduce, "Case 3a: tree_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(tree_Reduce, "Case 3a: tree_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 1, rack);

	time_it(tree_Reduce, "Case 3a: tree_Reduce", reps, 
	        &data, &result, 1, MPI_DOUBLE, MPI_SUM, 8, rack);

    //////////////////////////////////////////////////////////
    // experiments on reductions using large arrays of data //
	int n = 100000;
	double D[n];
	for(int i = 0; i < n; ++i) D[i] = r+1;
	double R[n];

	time_it(MPI_Reduce, "Case 1b: MPI_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(MPI_Reduce, "Case 1b: MPI_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 1, rack);

	time_it(MPI_Reduce, "Case 1b: MPI_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 8, rack);

	time_it(star_Reduce, "Case 2b: star_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(star_Reduce, "Case 2b: star_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 1, rack);

	time_it(star_Reduce, "Case 2b: star_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 8, rack);

	time_it(tree_Reduce, "Case 3b: tree_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack);

	time_it(tree_Reduce, "Case 3b: tree_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 1, rack);

	time_it(tree_Reduce, "Case 3b: tree_Reduce", 2, 
	        D, R, n, MPI_DOUBLE, MPI_SUM, 8, rack);

    }
    MPI_Finalize();
	
}

// time_it() definition:
void time_it(int (*reduce_func)(void*, void*, int, int, int, int, int), 
             char* reducer_name, int reps,
             void* data, void* result, int size, MPI_Datatype type,
	     MPI_Op binop, int root, MPI_Comm comm)
/* Known bugs:  for the printout part, time_it() assumes the result 
datatype is double and it shows just the last value in the result array.
*/
{
	int r; MPI_Comm_rank(comm, &r);
	double start_time, elapsed_time;
	MPI_Barrier(comm);
	if (r == root) start_time = MPI_Wtime();

	for (int i = 0; i < reps; ++i)
	    reduce_func(data, result, size, type, binop, root, comm);

	if (r == root) 
	{   elapsed_time = MPI_Wtime() - start_time;
	    cout << reducer_name << " computed " 
                 << static_cast<double*>(result)[size-1];
	    cout << " repeated " << reps << " times";
	    cout << " in " << elapsed_time << " seconds." << endl;
	}
} // time_it()
