// file lz.cc updated 4/99 -bds
//
// Lempel-Ziv (LZW style) compression filter.
//
// Compressed format produced by compress() and consumed by uncompress():
// a sequence of decimal code numbers, each followed by a single space.
//
// Define LZ_NO_MAIN before compiling to leave out main(), e.g. to link the
// codec into another program or a test harness.

#include <cstring>
#include <iostream>
#include <map>
#include <stack>
#include <stdexcept>
#include <utility>

typedef int codeunit;

class LZT /* Lempel Ziv Table */
{
    /* Capacity of the table.  static, so that it is a compile-time constant
       usable as the array bound below. */
    static const codeunit SIZE = 4096; /* 2^12 */

    /* One table entry: string(code) = string(init_seg) followed by last.
       init_seg == -1 marks a single character string. */
    class node
    {
    public:
        codeunit init_seg;
        char last;
        node(codeunit u = -1, char K = 0) { init_seg = u; last = K; }
    };

    /* Reverse index: (prefix code, appended character) -> code. */
    typedef std::map<std::pair<codeunit, char>, codeunit> code_map;

    node table[SIZE];
    code_map codes;
    codeunit next; /* first unused code; codes 0 .. next-1 are in the table */

public:
    LZT()
    /* Constructor.  Initialize table to have codes for all 256 single
       character strings. */
    ;

    /* A codeunit is a compressed representation of a string.  In discussing
       the LZT functions we use string(w) to denote the string represented by
       codeunit w.  But string(w) is not an implemented function.  Also
       string(w)K means the string represented by w with character K adjoined
       at the right end. */

    void make_code(codeunit u, char K)
    /* Assumes code u is in the table and that no code for string(u)K is
       currently in the table.  Installs the code for string(u)K.
       Does nothing when the table is already full. */
    ;

    void get_or_make_code(codeunit u, char K, codeunit& v, bool& isnew)
    /* Set v to the code for string(u)K.  If no such code existed, isnew is
       set to true and the code is installed if there is room; when the table
       is full nothing is installed and v is set to -1.  Otherwise isnew is
       false. */
    ;

    bool new_code(codeunit w)
    /* Return true iff w is not in the table. */
    ;

    char first(codeunit w)
    /* Assumes w is in the table.  Return the first character in string(w).
       Returns 0 if w is not in the table. */
    ;

    void out(codeunit w, std::ostream& output)
    /* Write string(w) to output.  Writes nothing if w is not in the table. */
    ;
};

void compress(std::istream& input, std::ostream& output)
/* Read bytes from input until end of stream and write the corresponding
   sequence of codes to output.  Empty input produces empty output. */
{
    LZT T;
    char K;
    codeunit v, w;
    bool isnew;

    /* get() rather than >> so that whitespace bytes are not skipped. */
    if (!input.get(K))
        return;
    /* Go through unsigned char: a plain char may be negative, and codes for
       single characters are 0..255. */
    v = static_cast<unsigned char>(K);

    while (input.get(K)) {
        /* if no compression code exists for string(v)K then
             1. make a compression code for string(v)K for future use.
             2. output v (the code of a longest possible previously seen
                string in the input).
             3. use K to start the next compression code.
           otherwise
             1. set v to the compression code of string(v)K and continue
                seeking the longest previously seen string from the input. */
        T.get_or_make_code(v, K, w, isnew);
        if (isnew) {
            output << v << ' ';
            v = static_cast<unsigned char>(K);
        } else {
            v = w;
        }
    }

    /* The code for the final pending string has not been written yet. */
    output << v << ' ';
}

void uncompress(std::istream& input, std::ostream& output)
/* Read whitespace separated codes from input and write the strings they
   stand for to output.  Empty input produces empty output.
   Throws std::runtime_error if a code cannot be valid at the point where it
   appears. */
{
    LZT T;
    char K;
    codeunit w, v;

    if (!(input >> v))
        return;
    if (T.new_code(v))
        throw std::runtime_error("lz: corrupt compressed input");
    T.out(v, output);

    while (input >> w) {
        /* First make the (always new) compression code for the previous code
           v and first character K of the string of the current code w.

           However there is a slight chance the current code is unknown to
           us.  The only way w can be unfamiliar to us is if the sender
           happened to use the extension of the previous code that she had
           just made, an extension we have not yet installed in our table.
           In that case the first character of the two codes are the same,
           so we can install the new code, then use it.

           For example suppose a portion of the text was "ababa", and v such
           that string(v) = "ab" was sent because the sender did not at that
           time have a code for "aba".  The sender would have then made a
           code, w, for "aba" and, as it turns out, immediately used it for
           the last 3 letters of "ababa".  In that case, since we have not
           previously seen w, it is not in our table.  We cannot make it from
           v and the first letter "a" of w, because we do not know w yet.
           However in this situation it is necessarily the case that v and w
           have the same first letter, so we can use the known first letter
           of v. */
        K = T.first(T.new_code(w) ? v : w);
        T.make_code(v, K);

        /* If w is still unknown it was not the code we just installed (or it
           is out of range), so the input is not something compress wrote. */
        if (T.new_code(w))
            throw std::runtime_error("lz: corrupt compressed input");

        /* output string(w) and continue */
        T.out(w, output);
        v = w;
    }
}

#ifndef LZ_NO_MAIN
static void usage(const char* program)
/* Print the command line help. */
{
    std::cout << " This is a filter to compress:\n";
    std::cout << program << " c < source_file > compressed_file\n";
    std::cout << " or uncompress:\n";
    std::cout << program << " u < compressed_file > source_file\n";
}

int main(int argc, char** argv)
{
    if (argc != 2) {
        usage(argv[0]);
        return 0;
    }

    try {
        /* Compare the characters; == on char* would compare addresses. */
        if (std::strcmp(argv[1], "c") == 0) {
            compress(std::cin, std::cout);
        } else if (std::strcmp(argv[1], "u") == 0) {
            uncompress(std::cin, std::cout);
        } else {
            usage(argv[0]);
            return 1;
        }
    } catch (const std::runtime_error& problem) {
        std::cerr << problem.what() << '\n';
        return 1;
    }
    return 0;
}
#endif /* LZ_NO_MAIN */

// The code below is the implementation of the LZT class methods.

LZT::LZT()
/* Constructor.  Initialize table to have codes for all 256 single character
   strings. */
{
    /* node's default constructor has already set every entry to
       (init_seg = -1, last = 0); only the single character codes need their
       character filled in. */
    for (int i = 0; i < 256; i++) {
        table[i].init_seg = -1;
        table[i].last = static_cast<char>(i);
    }
    next = 256;
}

void LZT::make_code(codeunit u, char K)
/* Assumes code u is in the table and that no code for string(u)K is
   currently in the table.  Installs the code for string(u)K.
   Does nothing when the table is already full, so that the reverse index
   never names a code that has no table entry. */
{
    if (next >= SIZE)
        return;
    table[next] = node(u, K);
    codes[std::make_pair(u, K)] = next;
    ++next;
}

void LZT::get_or_make_code(codeunit u, char K, codeunit& v, bool& isnew)
/* Set v to the code for string(u)K.  If no such code existed, isnew is set
   to true and the code is installed if there is room; when the table is full
   nothing is installed and v is set to -1.  Otherwise isnew is false. */
{
    code_map::const_iterator hit = codes.find(std::make_pair(u, K));
    if (hit != codes.end()) {
        v = hit->second;
        isnew = false;
        return;
    }

    isnew = true;
    if (next < SIZE) {
        v = next;
        make_code(u, K);
    } else {
        v = -1;
    }
}

bool LZT::new_code(codeunit w)
/* Return true iff w is not in the table.  Codes 0 .. next-1 are in the
   table; anything else, including out of range values, is not. */
{
    return w < 0 || w >= next;
}

char LZT::first(codeunit w)
/* Assumes w is in the table.  Return the first character in string(w).
   Returns 0 if w is not in the table. */
{
    char K = 0;
    if (new_code(w))
        return K;
    /* Follow the prefix links back to the single character code. */
    while (w >= 0) {
        K = table[w].last;
        w = table[w].init_seg;
    }
    return K;
}

void LZT::out(codeunit w, std::ostream& output)
/* Write string(w) to output.  Writes nothing if w is not in the table. */
{
    if (new_code(w))
        return;

    /* The prefix links yield the characters last to first; the stack puts
       them back in reading order. */
    std::stack<char> S;
    while (w >= 0) {
        S.push(table[w].last);
        w = table[w].init_seg;
    }
    while (!S.empty()) {
        output.put(S.top());
        S.pop();
    }
}

/*
   File compression methods

                    Huffman                      Lempel-Ziv

   encoded text     fixed length                 variable length
                    (one char)                   (several chars)

   codeunit         variable length              fixed length
                    (one or more bits)           (12 bits)

   prefix property  NO prefix of a codeunit      EVERY prefix of an encoded
                    is a codeunit                string is an encoded string

   other features   depends on prior             adaptive: set of codeunits
                    knowledge of character       in use can change as
                    frequencies                  compression/uncompression
                                                 proceeds

                    provably best in             flexible - competes well with
                    its class                    specialist compression methods
                                                 for each special kind of
                                                 input text
*/