// file lz.cc  updated 4/99 -bds
#include <iostream.h>

/* A codeunit is the integer code that stands for a string in the
   Lempel-Ziv table.  The value -1 is used by LZT::node as the default
   "no initial segment" marker. */
typedef int codeunit;

class LZT /* Lempel Ziv Table */
{
  const codeunit SIZE = 4096; /*2^12*/
  class node 
  {public:
    codeunit init_seg; 
    char last;
    node(codeunit u = -1, char K = 0)
    { init_seg = u; last = K; }
  };
  node table[SIZE];
  set<node> codes;
  codeunit next;
 public:
  LZT() 
    /* Constructor.  Initialize table to have codes for all 256 
       single character strings. */
    ;
  /* A codeunit is a compressed representation of a string.
     In discussing the LZT functions we use string(w) to denote the string
     represented by codeunit w.  But string(w) is not an implemented 
     function. Also string(w)K means the string represented by w with 
     character K adjoined at the right end. */
  void make_code(codeunit u, char K)
    /* Assumes code u is in the table and that no code for string(u)K is
       currently in the table.  Installs the code for string(u)K. */
    ;
  void get_or_make_code(codeunit u, char K, codeunit & v, bool & isnew)
    /* Set v to the code for string(u)K.  If this is a new code, install it 
       in the table and set isnew to true, else isnew is false. */
    ;
  bool new_code(codeunit w)
    /* Return true iff w is not in the table */
    ;
  char first(codeunit w)
    /* Assumes w is in the table. Return the first character in string(w) */
    ;
  void out(codeunit w, ostream& output)
    /* write string(w) to output. */
    ;
};

/* Read characters from input and write the sequence of Lempel-Ziv codes
   for them to output.  Each code is written as a decimal integer followed
   by one space, so that the codes can be read back one at a time with
   operator>>.  Nothing is written when input is empty. */
void compress(istream& input, ostream& output)
{
  LZT T;
  char K; codeunit v, w; bool isnew;

  /* get() is used instead of operator>> because operator>> skips
     whitespace, which would silently drop blanks and newlines. */
  if (!input.get(K))
    return;                           /* empty input: nothing to encode */

  /* Go through unsigned char so that the initial code is in 0..255 even
     where plain char is signed. */
  v = static_cast<unsigned char>(K);
  while (input.get(K))
  {
    /* if no compression code exists for string(v)K then
          1. make a compression code for string(v)K for future use.
          2. output v (the code of a longest possible previously seen string
             in the input). 
          3. use K to start the next compression code.
       otherwise
          1. set v to the compression code of string(v)K and continue
             seeking the longest previously seen string from the input.
    */
    T.get_or_make_code(v, K, w, isnew);
    if (isnew) 
    { output << v << ' ';
      v = static_cast<unsigned char>(K);
    }
    else 
      v = w;
  }

  /* The code for the final pending string has not been written yet. */
  output << v << ' ';
}

/* Read whitespace-separated integer codes from input and write the strings
   they stand for to output.  Decoding stops quietly at end of input, or at
   the first code that is negative or cannot be resolved. */
void uncompress(istream& input, ostream& output)
{
  LZT T;
  char K; codeunit w, v;

  /* The first code must be readable and already in the table; otherwise
     there is nothing that can be decoded. */
  if (!(input >> v) || v < 0 || T.new_code(v))
    return;
  T.out(v, output);
  while (input >> w)
  {
    if (w < 0)
      return;                         /* not a valid code */

    /* First make the (always new) compression code for the previous code v
       and first character K of the string of the current code w.  However 
       there is a slight chance the current code is unknown to us.  The only 
       way w can be unfamiliar to us is if the sender happened to use the 
       extension of the previous code that she had just made, an extension 
       we have not yet installed in our table.  In that case the first 
       characters of the two codes are the same, so we can install the new 
       code, then use it.

       For example suppose a portion of the text was "ababa", and v 
       such that string(v) = "ab" was sent because the sender did not
       at that time have a code for "aba".  The sender would have then
       made a code, w, for "aba" and, as it turns out, immediately used it 
       for the last 3 letters of "ababa".  In that case, since we
       have not previously seen w, it is not in our table.  We cannot
       make it from v and the first letter "a" of w, because we do not
       know w yet.  However in this situation it is necessarily the
       case that v and w have the same first letter, so we can use the
       known first letter of v.
    */
    K = T.first(T.new_code(w) ? v : w);
    T.make_code(v, K);

    /* If w is still unknown after the install, it is not the code that was
       just made, so the input is not a valid code stream. */
    if (T.new_code(w))
      return;

    /* output string(w) and continue */
    T.out(w, output);  
    v = w;
  }
}

/* Filter entry point.  With exactly one argument equal to "c" the standard
   input is compressed to the standard output; with any other single
   argument it is uncompressed.  With any other argument count a usage
   message is printed.  Always returns 0. */
int main(int argc, char** argv)
{ 
  if (argc != 2)
  { cout << "  This is a filter to compress:\n";
    cout <<  argv[0] << " c < source_file > compressed_file\n";
    cout << "  or uncompress:\n"; 
    cout <<  argv[0] << " u < compressed_file > source_file\n";
  }
  /* Compare the characters of the argument.  Writing argv[1] == "c" would
     compare two pointer values rather than the text they point to. */
  else if (argv[1][0] == 'c' && argv[1][1] == '\0')
    compress(cin, cout);
  else 
    uncompress(cin, cout);
  return 0;
} 

// The code below is an incomplete implementation of the LZT class methods.
LZT::LZT() 
  /* Constructor.  Entries 0..255 become the single character strings
     (no initial segment, last = the character whose value is the index).
     Every remaining entry is marked unused (init_seg -1, last 0).  New
     codes are installed starting at index 256. */
{
  int slot = 0;
  while (slot < SIZE)
  { table[slot].init_seg = -1;
    table[slot].last = (char) ((slot < 256) ? slot : 0);
    ++slot;
  }
  next = 256;
}
  /* A codeunit is a compressed representation of a string.
     In discussing the LZT functions we use string(w) to denote the string
     represented by codeunit w.  But string(w) is not an implemented 
     function. Also string(w)K means the string represented by w with 
     character K adjoined at the right end. */

void LZT::make_code(codeunit u, char K)
  /* Assumes code u is in the table and that no code for string(u)K is
     currently in the table.  Installs the code for string(u)K at index
     next and advances next.  When the table is already full nothing is
     installed, and nothing is added to codes either, so that codes never
     holds a node that has no table entry. */
{
  if ( next < SIZE )
  { node n(u,K);
    table[next++] = n;
    codes.insert(n);
  }
}

void LZT::get_or_make_code(codeunit u, char K, codeunit& v, bool& isnew)
  /* Set v to the code for string(u)K.  If such a code is already in the
     table, isnew is set to false.  Otherwise isnew is set to true and the
     code is installed; if the table is full nothing can be installed and
     v is set to -1. */
{
  /* Look for an existing entry.  Entries below 256 are the single
     character strings, so only the installed codes 256..next-1 can stand
     for string(u)K.  This is a linear scan of at most SIZE entries. */
  for (codeunit c = 256; c < next; c++)
  { if (table[c].init_seg == u && table[c].last == K)
    { v = c;
      isnew = false;
      return;
    }
  }

  /* No code yet.  make_code installs at index next when there is room. */
  isnew = true;
  if (next < SIZE)
  { v = next;
    make_code(u, K);
  }
  else
    v = -1;
}

bool LZT::new_code(codeunit w)
  /* Return true iff w is not in the table.  A value outside 0..SIZE-1 can
     never be in the table, so it is reported as new without indexing the
     table.  Values 0..255 are always present; a larger value is present
     once an entry with an initial segment has been installed there. */
{
  if (w < 0 || w >= SIZE)
    return true;
  return w >= 256 && table[w].init_seg == -1;
}

char LZT::first(codeunit w)
  /* Assumes w is in the table. Return the first character in string(w).
     Follows the init_seg links from w until they run out; the last
     character of the final entry visited is the first character of the
     whole string. */
{ char head;
  for (codeunit c = w; c >= 0; c = table[c].init_seg)
    head = table[c].last;
  return head;
}

#include <stack.h>
void LZT::out(codeunit w, ostream& output)
  /* write string(w) to output.  Following the init_seg links yields the
     characters last-to-first, so they are stacked and then popped to
     write them in reading order. */
{ stack<char> reversed;
  for (codeunit c = w; c >= 0; c = table[c].init_seg)
    reversed.push(table[c].last);
  for ( ; ! reversed.empty(); reversed.pop())
    output << reversed.top();
}

/*
                    File compression methods


               Huffman                 Lempel-Ziv

encoded        fixed                   variable
text length    (one char)              (several chars)

codeunit       variable                fixed
length         (one or more bits)      (12 bits)

prefix         NO prefix of a          EVERY prefix of an encoded 
property       codeunit is a codeunit  string is an encoded string

other          depends on prior        adaptive:  set of codeunits 
features       knowledge of character  in use can change as 
	       frequencies             compression/uncompression proceeds 

	       provably best in        flexible - competes well with 
	       its class               specialist compression methods 
				       for each special kind of input text           
*/
