#ifndef __BUTTERFLY_ALLREDUCE__
#define __BUTTERFLY_ALLREDUCE__
#include <mpi.h>

int butterfly_Allreduce(void* local, void* result, int size, MPI_Datatype type, MPI_Op binop, MPI_Comm comm)
{
/* This stub ignores the input type, acts as if it were MPI_Double.
 * This stub ignores the input binop, acts as if it were MPI_Sum.
 */
	int r,p; 
	MPI_Status status;
	MPI_Comm_size(comm, &p);
	MPI_Comm_rank(comm, &r);
        double* R = static_cast<double*>(result);
        double* L = static_cast<double*>(local);
	double* R2;

        //// k = highest power of 2 which is not greater than p.
	int k; for (k=1; k <= p; k *= 2); k /=2;
	if ( r < k ) R2 = new double[size];

        //// get data from rough edge beyond k
	if ( r < k && r+k < p ) 
	{
	    MPI_Recv(R2, size, type, r+k, 0, comm, &status);
	    for(int i = 0; i < size; ++i) R[i] = L[i] + R2[i];
	}
        else if ( r >= k )
	    MPI_Send(L, size, type, r-k, 0, comm);
	else // p - k <= r < k 
	    for(int i = 0; i < size; ++i) R[i] = L[i];
	//// now procs [0..k) have R set, and the data from 
        //// procs [k..p) is included.

        //// butterfly on procs [0..k), where k is an exact power of 2.
	if ( r < k ) 
	    for (int j = 1; j < k; j *= 2)
	    {
	        int rr = (r % (2*j) < j) ? r+j : r-j;
	        MPI_Sendrecv(R, size, type, rr, 0,
                             R2, size, type, rr, 0, 
                              comm, &status);
	        for(int i = 0; i < size; ++i) R[i] += R2[i];
	    }

        //// give result to rough edge beyond k.
	if ( r < k && r+k < p ) 
	    MPI_Send(R, size, type, r+k, 0, comm);
        else if ( r >= k )
	    MPI_Recv(R, size, type, r-k, 0, comm, &status);

//	/*********** correctness checking code *************/
//        if (r != 0) 
//	    MPI_Send(R, size, type, 0, 0, comm);
//        else for (int i = 1; i < p; ++i)
//	{
//	    MPI_Recv(R2, size, type, i, 0, comm, &status);
//	    for (int j = 0; j < size; ++j)
//		if ( R[j] != R2[j] )
//		    cerr << "problem on proc " << i << " in posn " << j 
//			<< ": it's R[j] = " << R2[j]
//			<< "but 0's R[j] = " << R[j] << endl;
//		
//	}
//	/*********** end correctness checking code *************/
}

#endif
