/* cyclehunt.cc  -bds 11/02
 * One dimensional cellular automata project. 
 * To find the length of the cycles in cellular automata. 
 * To discover which rules are regular or irregular regarding cycle length.
 * Draft sequential code */

/**************************************************************************************
       ProjectB goal is to parallelize the first for-loop in main() below.

That loop involves cycle hunting for 2^14 combinations of rules and initial states
(each of 256 rules with each of 64 initial states).  For each combination
it tries to find a cycle within M = 10^5 generation updates of a one dimensional ring 
cellular automaton with 64 cells.  

The problem here is the lack of load balance.  Some of the searches are very short 
and some are very long.  Find a way to dispatch tasks so that the work on the several 
processors is not too out of balance.  The main function used in the search is findCycle, 
whose parameter m limits how far it searches.  You can break up a call to findCycle 
into segments.  For instance, the serial code calls findCycle(data, M), but you could 
break this up into two calls findCycle(data, M/2), or 100 calls findCycle(data, 1000), 
etc.  Note that the data often gets into the "DONE" state well before this many calls.  
The data can be queried for DONE-ness, and also observe that when findCycle is called 
on DONE data, it is effectively a no-op.

The serial code runs for about 7 minutes on honda.  The project goal is to speed it up
as much as you can.  Can you get it under 30 seconds?!  For this project you can report 
just the timing data on the number of processors which gives you your best bottom line 
time.  Simple best performance on some subset, possibly all, of the 16 rack processors 
is the goal.  Plots of speedup/efficiency on various numbers of nodes is optional here.

Note that it is OK to run more than one process per processor.  When load balancing is 
an issue, doing this is sometimes a useful strategy (but only sometimes).  Also, it is 
acceptable for Porche to act as the master process.  Note that there may be heavy use 
of Porsche in the next couple of weeks.  

There is no need to alter any of the subroutines below, except main, and it is not expected  
that you will alter any of the other procedures.  However, for the sake of speed, you can 
undertake optimizations of some of the subroutines if you wish.  Any idea that contributes
to speedup contributes to the goal of this project.  

In main, there is no need to alter the second for-loop (the output stage).  Your parallel 
code should have the identical output to the serial code.  The serial output is saved in 
~saunders/cyclehunt.result.  In designing your solution, do not use information about 
specific rules and specific initial states that you may learn from this output or during
debugging.  Your load balancing should work as well if the set of rules and/or set
of initial states were changed.  In fact the point of the parallelization is not to
shorten a specific 7 minute computation.  That, after all, has already been run.  
Rather, the point is to shorten future days long computations with the most interesting
rules and with many initial states. 

In your writeup describe your strategy for organizing the tasks to achieve a reasonable
load balance.  Include code, output, timing data, discussion of it's speedup/efficiency.
Plots are optional.  Due Dec 10.

**************************************************************************************/
#include <vector>
#include <map>
#include <iostream>
//#include <stdlib.h>
using namespace std;

#include "apply_rule.h"
// apply_rule.h defines these  functions:
// apply_rule(), display_cells(), areEqual(), and binrule().
// apply_rule(rule, left, center, right) gives next value for center.
// according to rule.  You shouldn't need to modify any of that.


/* cellular_automaton_step applies the rule to modify each position
 * in [b..e), assuming wrap around so that b is a neighbor of e-1.
 * Requires e-b is at least 3, and 0 <= rule < 256.
 */
template<class Ptr> 
void cellular_automaton_step(int rule, Ptr b, Ptr e)
{   typedef iterator_traits<Ptr>::value_type Cell;
    Cell left = e[-1];
    Cell first = *b;
    for (Ptr i = b; i < e-1; ++i)
    {   left = apply_rule(rule, left, *i, i[1]);
        swap(left, *i);
    }
    e[-1] = apply_rule(rule, left, e[-1], first);
}

const int N = 64; // Ring cellular automaton size for this project.

enum Stage {FINDING_CYCLE_NODE, FINDING_CYCLE_LENGTH, DONE}; 

// Cycle_hunt_checkpoint allows for dispatching of a controlled amount of 
// cycle hunting for a given rule and initial state and afterwards having a 
// checkpoint which can be used for a further amount of hunting for the same cycle.  
struct Cycle_hunt_checkpoint
{   int rule;
    vector<int> C;
    vector<int> D; 
    int to_cycle_steps; // Steps taken by C and D during hunt for a cycle node. 
			// It will be 1 + 3k for some k.
    int length_cycle_steps; // Steps taken by C after a node in cycle has been found.
			    // When stage == DONE, length_cycle_steps is the cycle length.
    Stage stage;
    Cycle_hunt_checkpoint(int _rule, unsigned long long initial = 1)
    : rule(_rule), C(N), D(N), to_cycle_steps(1), length_cycle_steps(0), 
      stage(FINDING_CYCLE_NODE)
    {
	// set the initial states of C and D 
        unsigned long long L;
        for (int i = 0; i < N; ++i) C[i] = (initial & ((L=1) << i))?1:0;
        //for (int i = 0; i < N; ++i) cout << C[i]; cout << endl;
        copy(C.begin(), C.end(), D.begin());
        cellular_automaton_step(rule, D.begin(), D.end());
    }
};

void findCycle(Cycle_hunt_checkpoint& data, const int& m = 1)
// Updates data until cycle length found or M generation updates have occured, 
// whichever comes first.  Note that this function can be restarted with the same
// data argument and it will advance the data to a checkpoint m steps forward
// from it's input checkpoint state.
{
    int i = 0;
    switch (data.stage)
    {    
	 case FINDING_CYCLE_NODE: 

	    // get a position on the cycle.
	    while ( !areEqual(data.C.begin(), data.C.end(), data.D.begin()) && i < m)
	    {
		cellular_automaton_step(data.rule, data.C.begin(), data.C.end());
		cellular_automaton_step(data.rule, data.D.begin(), data.D.end());
		cellular_automaton_step(data.rule, data.D.begin(), data.D.end());
		data.to_cycle_steps += 3;
		++i;
	    }
	    if (i < m) 
	        data.stage = FINDING_CYCLE_LENGTH;
	    else
	        break;

        case FINDING_CYCLE_LENGTH: 
	    do
	    {
                cellular_automaton_step(data.rule, data.C.begin(), data.C.end());
		data.length_cycle_steps += 1;
		++i;
 	    }
	    while ( !areEqual(data.C.begin(), data.C.end(), data.D.begin()) && i < m );
	    if (i < m) data.stage = DONE;
	    break;

         case DONE: break; 
     }
}
         

int main(int argc, char** argv)
{
    int M = 100000; // max iterations
    typedef map<int, int> FreqTable;
    vector<FreqTable> Rules(256);
    unsigned long long initial;

    // 2^14 findCycle trials = 256(rules) x 64(initial states) 
    for (int rule = 0; rule < 256; ++rule)
    {   // 32 trials where initial state has only 1 or 2 cells on.
	for (unsigned int init = 1; init != 0; init *= 2)
	{   initial = init | 1; // add 1 when init > 1.  Thus 2 bits are on.
            Cycle_hunt_checkpoint CK(rule, initial);
	    findCycle(CK, M);
	    if (CK.stage == DONE)
	        Rules[rule][CK.length_cycle_steps] += 1;
	    else
	        Rules[rule][0] += 1;
        }
	    
        // 32 trials where initial state is random.
	for (int i = 0; i < 32; ++i)
	{   initial = random(); // 32 random bits
	    initial = (initial << 32) + random();  // now 64 random bits
            Cycle_hunt_checkpoint CK(rule, initial);
	    findCycle(CK, M);
	    if (CK.stage == DONE)
	        Rules[rule][CK.length_cycle_steps] += 1;
	    else
	        Rules[rule][0] += 1;
        }
    }

	    
    // display resulting cycle length frequencies for each rule.
    // Note: a cycle length of 0 means the length is not yet found after M steps.
    for (int rule = 0; rule < 256; ++rule)
    {   cout << "rule " << rule << ": "; 
	for (FreqTable::iterator i = Rules[rule].begin(); i != Rules[rule].end(); ++i)
	{
	    if (i != Rules[rule].begin()) cout << ", ";
	    cout << i->first << ":" << i->second;
	}
	cout << endl;
    }
}// main

