/* Timing test program for several reduction strategies. Note this program is designed to be used with 17 processes of which 16 participate in the reductions. At a minimum there must be 10 processes. */ #include #include #include #include "tree-n-star-reduce.h" /******************************************************* The goal of this homework is to implement, test, and analyze alternate ways to do reduction operations. The coding is to be done in tree-n-star-reduce.h, which can then be run with this test program. This program is written to obtain three timings in each of the following 6 cases: case 1a: One value per process being reduced using MPI_Reduce case 2a: One value per process being reduced using your star_Reduce case 3a: One value per process being reduced using your tree_Reduce case 1b: An array of many values per process reduced by MPI_Reduce case 2b: An array of many values per process reduced by your star_Reduce case 3b: An array of many values per process reduced by your tree_Reduce There are two problems in this homework: ******** 1. Problem 1 has three parts: Parts 1 and 2 are to improve the implementations of star_Reduce and tree_Reduce as specified in tree-n-star-reduce.h. Part 3 is to change this test routine so that it tests the reducers on int data instead of double data and also uses multiplication and max as the binop as well as addition in various test cases. Submit your code files, a script of a run with this program, and a one page discussion/explanation of the timings you got. ******** 2. Problem 2 is to implement star_Allreduce and tree_Allreduce. star_Allreduce does an Allreduce by a star_Reduce followed by a Bcast. tree_Allreduce does a tree_Reduce followed by a "tree_Bcast" (or you could call it "tree_Expand" or "reverse tree_Reduce"), i.e. a form of bcast wherein at each step the number of processes having the result data doubles. Submit (1) your code files, (2) a script of a run timing your two Allreduce's with MPI_Allreduce and with "butterfly_Allreduce", and (3) a one page discussion/explanation of the timings you got. The butterfly_Allreduce is found in porsche:~saunders/butterfly.h. It only works for root = 0, p a power of 2, datatype double, binop +. ******** The due date is Oct 22. ******************************************************* ******************************************************* Notes: 1. Sometimes it appears that timings depend on the order in which timed activities are done. If you wish, you may rearrange the order of the 18 timed activities in this program to see what effect that may have on the results. 2. Compare the effects of the order in which the 16 cpu's are used. The makefile might be: reduction: reduction.C tree-n-star-reduce.h mpiCC reduction.C -o reduction # 8 cars used cyclically (processes 0 and 8 on the 2 honda cpu's, ...) run1: mpirun -np 17 -machinefile cars reduction # 8 cars used doubly (processes 0 and 1 on the 2 honda cpu's, ...) run2: mpirun -np 17 -machinefile car2 reduction *******************************************************/ // time_it() is a tool for timing the various reduction functions. // The implementation is at the end of this file. void time_it(int (*reduce_func)(void*, void*, int, int, int, int, int), char* reducer_name, int reps, void* data, void* result, int size, MPI_Datatype type, MPI_Op binop, int root, MPI_Comm comm); main(int argc, char* argv[]) { MPI_Init(&argc, &argv); /////////////////////////////////////////////////////////////////// // Create a communicator which excludes process 0 (i.e. porsche). //// get group MPI_Group world_group; MPI_Comm_group(MPI_COMM_WORLD, &world_group); //// set up vector of ranks for new comm int wp; // size of world comm. MPI_Comm_size(MPI_COMM_WORLD, &wp); vector ranks(wp-1); for (int i = 1; i < wp; ++i) ranks[i-1] = i; //// make new group MPI_Group rack_group; MPI_Group_incl(world_group, wp-1, &ranks[0], &rack_group); //// make new comm MPI_Comm rack; MPI_Comm_create(MPI_COMM_WORLD, rack_group, &rack); /////////////////////////////////////////////////////////////////// // reduction timing tests int wr; // my rank in world comm. MPI_Comm_rank(MPI_COMM_WORLD, &wr); if (wr != 0) // leave porsche out of it altogether. { int r; MPI_Comm_rank(rack, &r); double data = r + 1, result; ///////////////////////////////////////////// // reductions where there is one data item // int reps = 10; // Case 1: Using MPI's built-in reduce function. time_it(MPI_Reduce, "Case 1a: MPI_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack); time_it(MPI_Reduce, "Case 1a: MPI_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 1, rack); time_it(MPI_Reduce, "Case 1a: MPI_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 8, rack); // Case 2: Using a reduce built from a star pattern of send-recv's. time_it(star_Reduce, "Case 2a: star_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack); time_it(star_Reduce, "Case 2a: star_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 1, rack); time_it(star_Reduce, "Case 2a: star_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 8, rack); // Case 3: Using a reduce built from a tree pattern of send-recv's. time_it(tree_Reduce, "Case 3a: tree_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 0, rack); time_it(tree_Reduce, "Case 3a: tree_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 1, rack); time_it(tree_Reduce, "Case 3a: tree_Reduce", reps, &data, &result, 1, MPI_DOUBLE, MPI_SUM, 8, rack); ////////////////////////////////////////////////////////// // experiments on reductions using large arrays of data // int n = 100000; double D[n]; for(int i = 0; i < n; ++i) D[i] = r+1; double R[n]; time_it(MPI_Reduce, "Case 1b: MPI_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack); time_it(MPI_Reduce, "Case 1b: MPI_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 1, rack); time_it(MPI_Reduce, "Case 1b: MPI_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 8, rack); time_it(star_Reduce, "Case 2b: star_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack); time_it(star_Reduce, "Case 2b: star_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 1, rack); time_it(star_Reduce, "Case 2b: star_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 8, rack); time_it(tree_Reduce, "Case 3b: tree_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 0, rack); time_it(tree_Reduce, "Case 3b: tree_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 1, rack); time_it(tree_Reduce, "Case 3b: tree_Reduce", 2, D, R, n, MPI_DOUBLE, MPI_SUM, 8, rack); } MPI_Finalize(); } // time_it() definition: void time_it(int (*reduce_func)(void*, void*, int, int, int, int, int), char* reducer_name, int reps, void* data, void* result, int size, MPI_Datatype type, MPI_Op binop, int root, MPI_Comm comm) /* Known bugs: for the printout part, time_it() assumes the result datatype is double and it shows just the last value in the result array. */ { int r; MPI_Comm_rank(comm, &r); double start_time, elapsed_time; MPI_Barrier(comm); if (r == root) start_time = MPI_Wtime(); for (int i = 0; i < reps; ++i) reduce_func(data, result, size, type, binop, root, comm); if (r == root) { elapsed_time = MPI_Wtime() - start_time; cout << reducer_name << " computed " << static_cast(result)[size-1]; cout << " repeated " << reps << " times"; cout << " in " << elapsed_time << " seconds." << endl; } } // time_it()