/* cyclehunt.cc -bds 11/02 * One dimensional cellular automata project. * To find the length of the cycles in cellular automata. * To discover which rules are regular or irregular regarding cycle length. * Draft sequential code */ /************************************************************************************** ProjectB goal is to parallelize the first for-loop in main() below. That loop involves cycle hunting for 2^14 combinations of rules and initial states (each of 256 rules with each of 64 initial states). For each combination it tries to find a cycle within M = 10^5 generation updates of a one dimensional ring cellular automaton with 64 cells. The problem here is the lack of load balance. Some of the searches are very short and some are very long. Find a way to dispatch tasks so that the work on the several processors is not too out of balance. The main function used in the search is findCycle, whose parameter m limits how far it searches. You can break up a call to findCycle into segments. For instance, the serial code calls findCycle(data, M), but you could break this up into two calls findCycle(data, M/2), or 100 calls findCycle(data, 1000), etc. Note that the data often gets into the "DONE" state well before this many calls. The data can be queried for DONE-ness, and also observe that when findCycle is called on DONE data, it is effectively a no-op. The serial code runs for about 7 minutes on honda. The project goal is to speed it up as much as you can. Can you get it under 30 seconds?! For this project you can report just the timing data on the number of processors which gives you your best bottom line time. Simple best performance on some subset, possibly all, of the 16 rack processors is the goal. Plots of speedup/efficiency on various numbers of nodes is optional here. Note that it is OK to run more than one process per processor. When load balancing is an issue, doing this is sometimes a useful strategy (but only sometimes). Also, it is acceptable for Porche to act as the master process. Note that there may be heavy use of Porsche in the next couple of weeks. There is no need to alter any of the subroutines below, except main, and it is not expected that you will alter any of the other procedures. However, for the sake of speed, you can undertake optimizations of some of the subroutines if you wish. Any idea that contributes to speedup contributes to the goal of this project. In main, there is no need to alter the second for-loop (the output stage). Your parallel code should have the identical output to the serial code. The serial output is saved in ~saunders/cyclehunt.result. In designing your solution, do not use information about specific rules and specific initial states that you may learn from this output or during debugging. Your load balancing should work as well if the set of rules and/or set of initial states were changed. In fact the point of the parallelization is not to shorten a specific 7 minute computation. That, after all, has already been run. Rather, the point is to shorten future days long computations with the most interesting rules and with many initial states. In your writeup describe your strategy for organizing the tasks to achieve a reasonable load balance. Include code, output, timing data, discussion of it's speedup/efficiency. Plots are optional. Due Dec 10. **************************************************************************************/ #include #include #include //#include using namespace std; #include "apply_rule.h" // apply_rule.h defines these functions: // apply_rule(), display_cells(), areEqual(), and binrule(). // apply_rule(rule, left, center, right) gives next value for center. // according to rule. You shouldn't need to modify any of that. /* cellular_automaton_step applies the rule to modify each position * in [b..e), assuming wrap around so that b is a neighbor of e-1. * Requires e-b is at least 3, and 0 <= rule < 256. */ template void cellular_automaton_step(int rule, Ptr b, Ptr e) { typedef iterator_traits::value_type Cell; Cell left = e[-1]; Cell first = *b; for (Ptr i = b; i < e-1; ++i) { left = apply_rule(rule, left, *i, i[1]); swap(left, *i); } e[-1] = apply_rule(rule, left, e[-1], first); } const int N = 64; // Ring cellular automaton size for this project. enum Stage {FINDING_CYCLE_NODE, FINDING_CYCLE_LENGTH, DONE}; // Cycle_hunt_checkpoint allows for dispatching of a controlled amount of // cycle hunting for a given rule and initial state and afterwards having a // checkpoint which can be used for a further amount of hunting for the same cycle. struct Cycle_hunt_checkpoint { int rule; vector C; vector D; int to_cycle_steps; // Steps taken by C and D during hunt for a cycle node. // It will be 1 + 3k for some k. int length_cycle_steps; // Steps taken by C after a node in cycle has been found. // When stage == DONE, length_cycle_steps is the cycle length. Stage stage; Cycle_hunt_checkpoint(int _rule, unsigned long long initial = 1) : rule(_rule), C(N), D(N), to_cycle_steps(1), length_cycle_steps(0), stage(FINDING_CYCLE_NODE) { // set the initial states of C and D unsigned long long L; for (int i = 0; i < N; ++i) C[i] = (initial & ((L=1) << i))?1:0; //for (int i = 0; i < N; ++i) cout << C[i]; cout << endl; copy(C.begin(), C.end(), D.begin()); cellular_automaton_step(rule, D.begin(), D.end()); } }; void findCycle(Cycle_hunt_checkpoint& data, const int& m = 1) // Updates data until cycle length found or M generation updates have occured, // whichever comes first. Note that this function can be restarted with the same // data argument and it will advance the data to a checkpoint m steps forward // from it's input checkpoint state. { int i = 0; switch (data.stage) { case FINDING_CYCLE_NODE: // get a position on the cycle. while ( !areEqual(data.C.begin(), data.C.end(), data.D.begin()) && i < m) { cellular_automaton_step(data.rule, data.C.begin(), data.C.end()); cellular_automaton_step(data.rule, data.D.begin(), data.D.end()); cellular_automaton_step(data.rule, data.D.begin(), data.D.end()); data.to_cycle_steps += 3; ++i; } if (i < m) data.stage = FINDING_CYCLE_LENGTH; else break; case FINDING_CYCLE_LENGTH: do { cellular_automaton_step(data.rule, data.C.begin(), data.C.end()); data.length_cycle_steps += 1; ++i; } while ( !areEqual(data.C.begin(), data.C.end(), data.D.begin()) && i < m ); if (i < m) data.stage = DONE; break; case DONE: break; } } int main(int argc, char** argv) { int M = 100000; // max iterations typedef map FreqTable; vector Rules(256); unsigned long long initial; // 2^14 findCycle trials = 256(rules) x 64(initial states) for (int rule = 0; rule < 256; ++rule) { // 32 trials where initial state has only 1 or 2 cells on. for (unsigned int init = 1; init != 0; init *= 2) { initial = init | 1; // add 1 when init > 1. Thus 2 bits are on. Cycle_hunt_checkpoint CK(rule, initial); findCycle(CK, M); if (CK.stage == DONE) Rules[rule][CK.length_cycle_steps] += 1; else Rules[rule][0] += 1; } // 32 trials where initial state is random. for (int i = 0; i < 32; ++i) { initial = random(); // 32 random bits initial = (initial << 32) + random(); // now 64 random bits Cycle_hunt_checkpoint CK(rule, initial); findCycle(CK, M); if (CK.stage == DONE) Rules[rule][CK.length_cycle_steps] += 1; else Rules[rule][0] += 1; } } // display resulting cycle length frequencies for each rule. // Note: a cycle length of 0 means the length is not yet found after M steps. for (int rule = 0; rule < 256; ++rule) { cout << "rule " << rule << ": "; for (FreqTable::iterator i = Rules[rule].begin(); i != Rules[rule].end(); ++i) { if (i != Rules[rule].begin()) cout << ", "; cout << i->first << ":" << i->second; } cout << endl; } }// main