/*
 * Copyright (c) 2007-2010, 2013 Genome Research Ltd.
 * Author(s): James Bonfield
 * 
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions are met:
 * 
 *    1. Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 * 
 *    2. Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 * 
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
 *    Institute nor the names of its contributors may be used to endorse
 *    or promote products derived from this software without specific
 *    prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This code implements "interlaced deflate", and is *based* on a
 * simplistic implementation of the Deflate algorithm.
 * See http://www.ietf.org/rfc/rfc1951.txt for details on the original
 * Deflate algorithm.
 *
 * It differs from RFC1951 in two important ways:
 *
 * 1) It only supports the huffman encoding step and does not attempt
 *    to do any LZ-style string matching to generate distance codes.
 *    (These generally do not improve data compression for our desired
 *    use.)
 *
 * 2) It optionally allows interleaving of multiple huffman trees for
 *    a single data stream. NB: when multiple codes are used this is
 *    incompatible with RFC1951.
 *
 * It has been written here, instead of using zlib, so that we can separate
 * out the encoding of the huffman tree from the compression of the data
 * stream into separate memory sections with the intent to optimise
 * compression of very small blocks of data by sharing one set of frequency
 * tables (ie huffman tree) with multiple sets of compressed data blocks.
 *
 * James Bonfield, 2007
 */

#ifdef HAVE_CONFIG_H
#include "io_lib_config.h"
#endif

#ifndef NDEBUG
#define NDEBUG /* disable asserts for production use */
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <assert.h>
#include <unistd.h>

#include "io_lib/deflate_interlaced.h"

#ifndef MIN
#    define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
#ifndef MAX
#    define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

/* #define TEST_MAIN */

/*
 * ---------------------------------------------------------------------------
 * Local structs & defines
 */

/* Used in tree construction only */
typedef struct node {
    int count;
    int sym; /* char or SYM_EOF */
    struct node *parent;
    struct node *next;
} node_t;

#define SYM_EOF 256

static void output_code_set(FILE *fp, huffman_codes_t *codes);
static void output_code_set2(FILE *fp, huffman_codes_t *codes);
int next_symbol(block_t *in, int *htab);

/*
 * ---------------------------------------------------------------------------
 * Our standard precomputed tables, for DNA, English text, etc.
 */

/* DNA */
/*
 * A   00
 * C   01
 * G   110
 * T   10
 * N   1110
 * EOF 11110
 * ?   11111*
 */
static huffman_code_t codes_dna[] = {
    {'A',  2}, {'C',  2}, {'T',  2}, {'G',  3}, {'N',  4}, {  0,  5},
    {SYM_EOF,  6},
    {  1, 13}, {  2, 13}, {  3, 13}, {  4, 13}, {  5, 13}, {  6, 13},
    {  7, 14}, {  8, 14}, {  9, 14}, { 10, 14}, { 11, 14}, { 12, 14},
    { 13, 14}, { 14, 14}, { 15, 14}, { 16, 14}, { 17, 14}, { 18, 14},
    { 19, 14}, { 20, 14}, { 21, 14}, { 22, 14}, { 23, 14}, { 24, 14},
    { 25, 14}, { 26, 14}, { 27, 14}, { 28, 14}, { 29, 14}, { 30, 14},
    { 31, 14}, { 32, 14}, { 33, 14}, { 34, 14}, { 35, 14}, { 36, 14},
    { 37, 14}, { 38, 14}, { 39, 14}, { 40, 14}, { 41, 14}, { 42, 14},
    { 43, 14}, { 44, 14}, { 45, 14}, { 46, 14}, { 47, 14}, {'0', 14},
    {'1', 14}, {'2', 14}, {'3', 14}, {'4', 14}, {'5', 14}, {'6', 14},
    {'7', 14}, {'8', 14}, {'9', 14}, { 58, 14}, { 59, 14}, { 60, 14},
    { 61, 14}, { 62, 14}, { 63, 14}, { 64, 14}, {'B', 14}, {'D', 14},
    {'E', 14}, {'F', 14}, {'H', 14}, {'I', 14}, {'J', 14}, {'K', 14},
    {'L', 14}, {'M', 14}, {'O', 14}, {'P', 14}, {'Q', 14}, {'R', 14},
    {'S', 14}, {'U', 14}, {'V', 14}, {'W', 14}, {'X', 14}, {'Y', 14},
    {'Z', 14}, { 91, 14}, { 92, 14}, { 93, 14}, { 94, 14}, { 95, 14},
    { 96, 14}, {'a', 14}, {'b', 14}, {'c', 14}, {'d', 14}, {'e', 14},
    {'f', 14}, {'g', 14}, {'h', 14}, {'i', 14}, {'j', 14}, {'k', 14},
    {'l', 14}, {'m', 14}, {'n', 14}, {'o', 14}, {'p', 14}, {'q', 14},
    {'r', 14}, {'s', 14}, {'t', 14}, {'u', 14}, {'v', 14}, {'w', 14},
    {'x', 14}, {'y', 14}, {'z', 14}, {123, 14}, {124, 14}, {125, 14},
    {126, 14}, {127, 14}, {128, 14}, {129, 14}, {130, 14}, {131, 14},
    {132, 14}, {133, 14}, {134, 14}, {135, 14}, {136, 14}, {137, 14},
    {138, 14}, {139, 14}, {140, 14}, {141, 14}, {142, 14}, {143, 14},
    {144, 14}, {145, 14}, {146, 14}, {147, 14}, {148, 14}, {149, 14},
    {150, 14}, {151, 14}, {152, 14}, {153, 14}, {154, 14}, {155, 14},
    {156, 14}, {157, 14}, {158, 14}, {159, 14}, {160, 14}, {161, 14},
    {162, 14}, {163, 14}, {164, 14}, {165, 14}, {166, 14}, {167, 14},
    {168, 14}, {169, 14}, {170, 14}, {171, 14}, {172, 14}, {173, 14},
    {174, 14}, {175, 14}, {176, 14}, {177, 14}, {178, 14}, {179, 14},
    {180, 14}, {181, 14}, {182, 14}, {183, 14}, {184, 14}, {185, 14},
    {186, 14}, {187, 14}, {188, 14}, {189, 14}, {190, 14}, {191, 14},
    {192, 14}, {193, 14}, {194, 14}, {195, 14}, {196, 14}, {197, 14},
    {198, 14}, {199, 14}, {200, 14}, {201, 14}, {202, 14}, {203, 14},
    {204, 14}, {205, 14}, {206, 14}, {207, 14}, {208, 14}, {209, 14},
    {210, 14}, {211, 14}, {212, 14}, {213, 14}, {214, 14}, {215, 14},
    {216, 14}, {217, 14}, {218, 14}, {219, 14}, {220, 14}, {221, 14},
    {222, 14}, {223, 14}, {224, 14}, {225, 14}, {226, 14}, {227, 14},
    {228, 14}, {229, 14}, {230, 14}, {231, 14}, {232, 14}, {233, 14},
    {234, 14}, {235, 14}, {236, 14}, {237, 14}, {238, 14}, {239, 14},
    {240, 14}, {241, 14}, {242, 14}, {243, 14}, {244, 14}, {245, 14},
    {246, 14}, {247, 14}, {248, 14}, {249, 14}, {250, 14}, {251, 14},
    {252, 14}, {253, 14}, {254, 14}, {255, 14},
};

/* DNA with a few ambiguity codes */
static huffman_code_t codes_dna_ambig[] = {
    {'A',  2}, {'C',  2}, {'T',  2}, {'G',  3}, {'N',  4}, {  0,  7},
    { 45,  7}, {'B',  8}, {'D',  8}, {'H',  8}, {'K',  8}, {'M',  8},
    {'R',  8}, {'S',  8}, {'V',  8}, {'W',  8}, {'Y',  8}, {SYM_EOF, 11},
    {226, 14}, {  1, 15}, {  2, 15}, {  3, 15}, {  4, 15}, {  5, 15},
    {  6, 15}, {  7, 15}, {  8, 15}, {  9, 15}, { 10, 15}, { 11, 15},
    { 12, 15}, { 13, 15}, { 14, 15}, { 15, 15}, { 16, 15}, { 17, 15},
    { 18, 15}, { 19, 15}, { 20, 15}, { 21, 15}, { 22, 15}, { 23, 15},
    { 24, 15}, { 25, 15}, { 26, 15}, { 27, 15}, { 28, 15}, { 29, 15},
    { 30, 15}, { 31, 15}, { 32, 15}, { 33, 15}, { 34, 15}, { 35, 15},
    { 36, 15}, { 37, 15}, { 38, 15}, { 39, 15}, { 40, 15}, { 41, 15},
    { 42, 15}, { 43, 15}, { 44, 15}, { 46, 15}, { 47, 15}, {'0', 15},
    {'1', 15}, {'2', 15}, {'3', 15}, {'4', 15}, {'5', 15}, {'6', 15},
    {'7', 15}, {'8', 15}, {'9', 15}, { 58, 15}, { 59, 15}, { 60, 15},
    { 61, 15}, { 62, 15}, { 63, 15}, { 64, 15}, {'E', 15}, {'F', 15},
    {'I', 15}, {'J', 15}, {'L', 15}, {'O', 15}, {'P', 15}, {'Q', 15},
    {'U', 15}, {'X', 15}, {'Z', 15}, { 91, 15}, { 92, 15}, { 93, 15},
    { 94, 15}, { 95, 15}, { 96, 15}, {'a', 15}, {'b', 15}, {'c', 15},
    {'d', 15}, {'e', 15}, {'f', 15}, {'g', 15}, {'h', 15}, {'i', 15},
    {'j', 15}, {'k', 15}, {'l', 15}, {'m', 15}, {'n', 15}, {'o', 15},
    {'p', 15}, {'q', 15}, {'r', 15}, {'s', 15}, {'t', 15}, {'u', 15},
    {'v', 15}, {'w', 15}, {'x', 15}, {'y', 15}, {'z', 15}, {123, 15},
    {124, 15}, {125, 15}, {126, 15}, {127, 15}, {128, 15}, {129, 15},
    {130, 15}, {131, 15}, {132, 15}, {133, 15}, {134, 15}, {135, 15},
    {136, 15}, {137, 15}, {138, 15}, {139, 15}, {140, 15}, {141, 15},
    {142, 15}, {143, 15}, {144, 15}, {145, 15}, {146, 15}, {147, 15},
    {148, 15}, {149, 15}, {150, 15}, {151, 15}, {152, 15}, {153, 15},
    {154, 15}, {155, 15}, {156, 15}, {157, 15}, {158, 15}, {159, 15},
    {160, 15}, {161, 15}, {162, 15}, {163, 15}, {164, 15}, {165, 15},
    {166, 15}, {167, 15}, {168, 15}, {169, 15}, {170, 15}, {171, 15},
    {172, 15}, {173, 15}, {174, 15}, {175, 15}, {176, 15}, {177, 15},
    {178, 15}, {179, 15}, {180, 15}, {181, 15}, {182, 15}, {183, 15},
    {184, 15}, {185, 15}, {186, 15}, {187, 15}, {188, 15}, {189, 15},
    {190, 15}, {191, 15}, {192, 15}, {193, 15}, {194, 15}, {195, 15},
    {196, 15}, {197, 15}, {198, 15}, {199, 15}, {200, 15}, {201, 15},
    {202, 15}, {203, 15}, {204, 15}, {205, 15}, {206, 15}, {207, 15},
    {208, 15}, {209, 15}, {210, 15}, {211, 15}, {212, 15}, {213, 15},
    {214, 15}, {215, 15}, {216, 15}, {217, 15}, {218, 15}, {219, 15},
    {220, 15}, {221, 15}, {222, 15}, {223, 15}, {224, 15}, {225, 15},
    {227, 15}, {228, 15}, {229, 15}, {230, 15}, {231, 15}, {232, 15},
    {233, 15}, {234, 15}, {235, 15}, {236, 15}, {237, 15}, {238, 15},
    {239, 15}, {240, 15}, {241, 15}, {242, 15}, {243, 15}, {244, 15},
    {245, 15}, {246, 15}, {247, 15}, {248, 15}, {249, 15}, {250, 15},
    {251, 15}, {252, 15}, {253, 15}, {254, 15}, {255, 15},
};

/* English text */
static huffman_code_t codes_english[] = {
    { 32,  3}, {'e',  3}, {'a',  4}, {'i',  4}, {'n',  4}, {'o',  4},
    {'s',  4}, {'t',  4}, {'d',  5}, {'h',  5}, {'l',  5}, {'r',  5},
    {'u',  5}, { 10,  6}, { 13,  6}, { 44,  6}, {'c',  6}, {'f',  6},
    {'g',  6}, {'m',  6}, {'p',  6}, {'w',  6}, {'y',  6}, { 46,  7},
    {'b',  7}, {'v',  7}, { 34,  8}, {'I',  8}, {'k',  8}, { 45,  9},
    {'A',  9}, {'N',  9}, {'T',  9}, { 39, 10}, { 59, 10}, { 63, 10},
    {'B', 10}, {'C', 10}, {'E', 10}, {'H', 10}, {'M', 10}, {'S', 10},
    {'W', 10}, {'x', 10}, { 33, 11}, {'0', 11}, {'1', 11}, {'F', 11},
    {'G', 11}, {  0, 15}, {  1, 15}, {  2, 15}, {  3, 15}, {  4, 15},
    {  5, 15}, {  6, 15}, {  7, 15}, {  8, 15}, {  9, 15}, { 11, 15},
    { 12, 15}, { 14, 15}, { 15, 15}, { 16, 15}, { 17, 15}, { 18, 15},
    { 19, 15}, { 20, 15}, { 21, 15}, { 22, 15}, { 23, 15}, { 24, 15},
    { 25, 15}, { 26, 15}, { 27, 15}, { 28, 15}, { 29, 15}, { 30, 15},
    { 31, 15}, { 35, 15}, { 36, 15}, { 37, 15}, { 38, 15}, { 40, 15},
    { 41, 15}, { 42, 15}, { 43, 15}, { 47, 15}, {'2', 15}, {'3', 15},
    {'4', 15}, {'5', 15}, {'6', 15}, {'7', 15}, {'8', 15}, {'9', 15},
    { 58, 15}, { 60, 15}, { 61, 15}, { 62, 15}, { 64, 15}, {'D', 15},
    {'J', 15}, {'K', 15}, {'L', 15}, {'O', 15}, {'P', 15}, {'Q', 15},
    {'R', 15}, {'U', 15}, {'V', 15}, {'X', 15}, {'Y', 15}, {'Z', 15},
    { 91, 15}, { 92, 15}, { 93, 15}, { 94, 15}, { 95, 15}, { 96, 15},
    {'j', 15}, {'q', 15}, {'z', 15}, {123, 15}, {124, 15}, {125, 15},
    {126, 15}, {127, 15}, {128, 15}, {129, 15}, {130, 15}, {131, 15},
    {132, 15}, {133, 15}, {134, 15}, {135, 15}, {136, 15}, {137, 15},
    {138, 15}, {139, 15}, {140, 15}, {141, 15}, {142, 15}, {143, 15},
    {144, 15}, {145, 15}, {146, 15}, {147, 15}, {148, 15}, {149, 15},
    {150, 15}, {151, 15}, {152, 15}, {153, 15}, {154, 15}, {155, 15},
    {156, 15}, {157, 15}, {158, 15}, {159, 15}, {160, 15}, {161, 15},
    {162, 15}, {163, 15}, {164, 15}, {165, 15}, {166, 15}, {167, 15},
    {168, 15}, {169, 15}, {170, 15}, {171, 15}, {172, 15}, {173, 15},
    {174, 15}, {175, 15}, {176, 15}, {177, 15}, {178, 15}, {179, 15},
    {180, 15}, {181, 15}, {182, 15}, {183, 15}, {184, 15}, {185, 15},
    {186, 15}, {187, 15}, {188, 15}, {189, 15}, {190, 15}, {191, 15},
    {192, 15}, {193, 15}, {194, 15}, {195, 15}, {196, 15}, {197, 15},
    {198, 15}, {199, 15}, {200, 15}, {201, 15}, {202, 15}, {203, 15},
    {204, 15}, {205, 15}, {206, 15}, {207, 15}, {208, 15}, {209, 15},
    {210, 15}, {211, 15}, {212, 15}, {213, 15}, {214, 15}, {215, 15},
    {216, 15}, {217, 15}, {218, 15}, {219, 15}, {220, 15}, {221, 15},
    {222, 15}, {223, 15}, {224, 15}, {225, 15}, {226, 15}, {227, 15},
    {228, 15}, {229, 15}, {230, 15}, {231, 15}, {232, 15}, {233, 15},
    {234, 15}, {235, 15}, {236, 15}, {237, 15}, {238, 15}, {239, 15},
    {240, 15}, {241, 15}, {242, 15}, {243, 15}, {244, 15}, {245, 15},
    {246, 15}, {247, 15}, {248, 15}, {249, 15}, {250, 15}, {251, 15},
    {252, 15}, {253, 15}, {254, 15}, {255, 15}, {SYM_EOF, 15},
};

static huffman_codeset_t *static_codeset[NCODES_STATIC];

/*
 * ---------------------------------------------------------------------------
 * Block_t structure support
 */

/*
 * Allocates and returns a new block_t struct of a specified default size.
 * A default 'data' pointer may be passed in, in which it must have
 * been created using malloc(size). Otherwise if data is NULL then
 * size indicates the amount of memory to allocate. Size maybe zero to
 * defer allocation.
 *
 * Returns newly created block_t* on success
 *         NULL on failure
 */
block_t *block_create(unsigned char *data, size_t size) {
    block_t *b = (block_t *)malloc(sizeof(*b));
    if (!b)
	return NULL;

    b->data = data;
    b->alloc = size;
    b->byte = 0;
    b->bit = 0;

    if (size && !data && NULL == (b->data = calloc(size, 1))) {
	free(b);
	return NULL;
    }

    return b;
}

/*
 * Deallocates memory created by block_create().
 * keep_data is a boolean which if true requests that the data held within
 * the block should not be deallocated as it is in use elsewhere.
 */
void block_destroy(block_t *blk, int keep_data) {
    if (!blk)
	return;

    if (!keep_data && blk->data)
	free(blk->data);
    
    free(blk);
}

/*
 * Ensures a block_t holds at least 'size' bytes.
 * Newly allocated data is initialised to zero.
 *
 * Returns 0 on success
 *        -1 on failure, leaving block pointing to the existing data
 */
int block_resize(block_t *blk, size_t size) {
    unsigned char *newp = NULL;

    if (!blk)
	return -1;

    /* Grow size to next power of 2, if we're growing */
    if (size > blk->alloc) {
	size--;
	size |= size >> 1;
	size |= size >> 2;
	size |= size >> 4;
	size |= size >> 8;
	size |= size >> 16;
	size++;
    }

    if (NULL == (newp = realloc(blk->data, size)))
	return -1;
    else
	blk->data = newp;

    if (size > blk->alloc)
	memset(&blk->data[blk->alloc], 0, size - blk->alloc);
    blk->alloc = size;

    return 0;
}


/*
 * ---------------------------------------------------------------------------
 * Tree building and code generation functions
 */

/*
 * Reverses the order of bits in the bottom nbits of val.
 * Returns the bit-reverse value.
 */
unsigned int bit_reverse(unsigned int val, int nbits) {
    unsigned int new = 0, i;

    for (i = 0; i < nbits; i++) {
	new = (new << 1) | (val & 1);
	val >>= 1;
    }

    return new;
}


/*
 * Generates canonical huffman codes given a set of symbol bit lengths.
 * The results are stored within the supplied huffman_codes_t struct.
 *
 * Returns 0 on success
 *        -1 on failure
 */
static int canonical_codes(huffman_codes_t *c) {
    int i, j;
    unsigned int code, last_len;
    int clens[33];
    int offs[33];
    huffman_code_t ctmp[258];
    signed int symtab[258];

    /* Sort by bit-length, subfield symbol - much faster than qsort() */
    for (i = 0; i < 258; i++)
	symtab[i] = -1;
    for (i = 0; i < c->ncodes; i++)
	symtab[c->codes[i].symbol] = i;
    for (i = 0; i <= 32; i++)
	offs[i] = clens[i] = 0;
    for (i = 0; i < c->ncodes; i++)
	clens[c->codes[i].nbits]++;
    for (i = 1; i <= 32; i++)
	offs[i] = offs[i-1] + clens[i-1];
    for (i = 0; i < 258; i++) {
	if (symtab[i] != -1)
	    ctmp[offs[c->codes[symtab[i]].nbits]++] = c->codes[symtab[i]];
    }
    memcpy(c->codes, ctmp, c->ncodes * sizeof(huffman_code_t));

    /*
     * Force all codes to be <= max_code_len. This is needed due to the
     * 15-bit length limitation of Deflate literal codes and the 7-bit 
     * limit of the code bit-length table.
     */
    /* Find first point of failure */
    for (i = 0; i < c->ncodes; i++) {
	if (c->codes[i].nbits > c->max_code_len)
	    break;
    }

    /*
     * From here on we shrink the length of the current code by increasing
     * the length of an earlier symbol, at last_code.
     */
    if (i != c->ncodes) {
	int delta = 0;

	/*
	fprintf(stderr, "=== REORDERING %d ===\n", c->code_set);
	output_code_set(stderr, c);
	output_code_set2(stderr, c);
	*/

	for (; i < c->ncodes; i++) {
	    int k, cur_len;

	    c->codes[i].nbits -= delta;
	    if (c->codes[i].nbits <= c->max_code_len)
		continue;

	    for (j = i; j >= 0 && c->codes[j].nbits >= c->max_code_len; j--)
		;
	    if (j < 0) {
		fprintf(stderr,
			"Too many symbols to fit in bit-length requirements\n");
		fprintf(stderr, "=== FAILING ===\n");
		output_code_set(stderr, c);
		output_code_set2(stderr, c);
		abort();
	    }

	    /*
	    fprintf(stderr, "Changing code %d/%d to len %d\n",
		    c->codes[i].symbol, c->codes[j].symbol,
		    c->codes[j].nbits+1);
	    */
	    cur_len = c->codes[i].nbits;
	    c->codes[i].nbits = ++c->codes[j].nbits;

	    /*
	     * Shrink the next code by one, or if none at that bit-length
	     * the next 2, and so on
	     */
	    delta = 1;
	    for (k = i+1; delta && k < c->ncodes; k++) {
		while (c->codes[k].nbits > cur_len) {
		    delta *= 2;
		    cur_len++;
		}
		c->codes[k].nbits--;
		delta--;
	    }
	    assert(delta == 0);
	}

	/*
	fprintf(stderr, "=== REORDERED TO %d ===\n", c->code_set);
	output_code_set(stderr, c);
	output_code_set2(stderr, c);
	*/

	/* Ordering is shot - regenerate via brute force way */
	return canonical_codes(c);
    }


    /* Generate codes */
    code = last_len = 0; /* stop warning */
    for (i = 0; i < c->ncodes; i++) {
	int nbits = c->codes[i].nbits;

	if (i == 0) {
	    code = 0;
	    last_len = nbits;
	} else {
	    code++;
	}
	if (nbits > last_len) {
	    code <<= (nbits - last_len);
	    last_len = nbits;
	}
	c->codes[i].code = bit_reverse(code, nbits);
    }

    /* Reindex so the symbol is the primary index into codes */
    for (i = 0; i <= 257; i++) {
	c->lookup[i].nbits = 0;
    }
    for (i = 0; i < c->ncodes; i++) {
        c->lookup[c->codes[i].symbol] = c->codes[i];
    }

    return 0;
}

static int node_compar2(const void *vp1, const void *vp2) {
    const node_t *n1 = *(const node_t **)vp1;
    const node_t *n2 = *(const node_t **)vp2;

    /*
     * The sort order is vital here. This needs to return the same collating
     * order on all systems so that differing qsort() functions will not
     * swap around symbols with the same bit lengths, hence we sort by both
     * fields to force a unique stable ordering.
     */
    if (n1->count != n2->count)
	return n1->count - n2->count;
    else
	return n2->sym - n1->sym;
}

/*
 * Computes the huffman bit-lengths for a data set. We don't care
 * about the actual tree, just how deep the symbols end up.
 *
 * Huffman trees are constructed by constructing a set of nodes
 * initially containing the symbol and it's frequency. We then merge
 * the two least used nodes to produce a new node with a combined
 * frequency. Repeat until one root node is left.
 *
 * data/len is the input data to analyse.
 *
 * 'eof' is a boolean to indicate whether the EOF symbol should be included
 * in the symbols produced.
 *
 * all_codes is a boolean to indicate whether we should include symbols not
 * found in the input data set. (This was used to create the static lookup
 * tables.)
 *
 * Returns huffman_codes_t* on success
 *         NULL on failure
 */
huffman_codes_t *calc_bit_lengths(unsigned char *data, int len,
				  int eof, int max_code_len, int all_codes,
				  int start, int skip) {
    int i, ncodes;
    node_t nodes[258+257], *head, *new = &nodes[258];
    node_t *n2[258+257];
    huffman_codes_t *c;
    int hist[256];

    if (NULL == (c = (huffman_codes_t *)malloc(sizeof(*c))))
	return NULL;
    c->codes_static = 0;
    c->max_code_len = max_code_len;

    /* Count frequencies of symbols */
    memset(hist, 0, 256*sizeof(*hist));
    /* Calc freqs */
    for (i = start; i < len; i+=skip) {
	hist[data[i]]++;
    }


    /*
     * Initialise nodes. We build a map of ASCII character code to node
     * number. (By default it's a simple 1:1 mapping unless legal_chars is
     * defined.)
     */
    ncodes = 0;
    if (eof) {
	nodes[ncodes].sym = SYM_EOF;
	nodes[ncodes].count = eof;
	nodes[ncodes].parent = NULL;
	n2[ncodes] = &nodes[ncodes];
	ncodes++;
    }

    /* All 256 chars existing at a minimal level */
    if (all_codes) {
	for (i = 0; i < 256; i++) {
	    nodes[ncodes].sym = i;
	    nodes[ncodes].count = hist[i];
	    nodes[ncodes].parent = NULL;
	    n2[ncodes] = &nodes[ncodes];
	    ncodes++;
	}
    } else {
	/* Only include non-zero symbols */
	for (i = 0; i < 256; i++) {
	    if (hist[i] == 0)
		continue;
	    nodes[ncodes].sym = i;
	    nodes[ncodes].count = hist[i];
	    nodes[ncodes].parent = NULL;
	    n2[ncodes] = &nodes[ncodes];
	    ncodes++;
	}
    }

    /* Sort by counts, smallest first and form a sorted linked list */
    qsort(n2, ncodes, sizeof(*n2), node_compar2);

    /* Skip symbols that do not occur, unless all_codes is true */
    for (i = 0; i < ncodes; i++) {
	n2[i]->next = i+1 < ncodes ? n2[i+1] : NULL;
    }

    /* Repeatedly merge two smallest values */
    head = n2[0];
    while (head && head->next) {
	node_t *after = head->next, *n;
	int sum = head->count + head->next->count;
	
	for (n = head->next->next; n; after = n, n = n->next) {
	    if (sum < n->count)
		break;
	}

	/* Produce a new summation node and link it in place */
	after->next = new;
	new->next = n;
	new->sym = '?';
	new->count = sum;
	new->parent = NULL;
	head->parent = new;
	head->next->parent = new;
	head = head->next->next;

	new++;
    }

    /* Walk up tree computing the bit-lengths for our symbols */
    c->ncodes = ncodes;
    c->codes = (huffman_code_t *)malloc(c->ncodes * sizeof(*c->codes));
    if (NULL == c->codes) {
	free(c);
	return NULL;
    }

    for (i = 0; i < ncodes; i++) {
	int len = 0;
	node_t *n;
	for (n = n2[i]->parent; n; n = n->parent) {
	    len++;
	}

	c->codes[i].symbol = n2[i]->sym;
	c->codes[i].freq   = n2[i]->count;
	c->codes[i].nbits  = len ? len : 1; /* special case, nul input */
    }

    if (0 != canonical_codes(c)) {
	free(c);
	return NULL;
    }

    return c;
}

/*
 * A special case of the generate_code_set() function below, but for
 * creating predefined code sets from bit-length arrays. Useful for
 * code that wants to use a predetermined huffman tree.
 *
 *
 * Returns huffman_codes_t* on success; free using huffman_codes_destroy().
 *         NULL on failure.
 */
huffman_codeset_t *codes2codeset(huffman_code_t *codes, int ncodes,
				 int code_num) {
    huffman_codeset_t *cs;
    huffman_codes_t *c;

    if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs))))
	return NULL;

    if (NULL == (c = (huffman_codes_t *)malloc(sizeof(*c))))
	return NULL;

    cs->codes = (huffman_codes_t **)malloc(sizeof(*cs->codes));
    cs->codes[0] = c;
    cs->ncodes = 1;
    cs->code_set = code_num;
    cs->blk = NULL;
    cs->bit_num = 0;
    cs->decode_t = NULL;
    cs->decode_J4 = NULL;
    
    c->codes_static = 1;
    c->max_code_len = MAX_CODE_LEN;

    c->codes = codes;
    c->ncodes = ncodes;

    cs->bit_num = 0; /* FIXME: need to know this */

    canonical_codes(c);

    return cs;
}

/*
 * Initialises and returns a huffman_codes_t struct from a specified code_set.
 * If code_set is not one of the standard predefined values then the
 * input data is analysed using calc_bit_lengths() above to produce the
 * optimal set of huffman codes, otherwise we return predefined values.
 *
 * 'eof' is a boolean to indicate whether the EOF symbol should be included
 * in the symbols produced.
 *
 * all_codes is a boolean to indicate whether we should include symbols not
 * found in the input data set. (This was used to create the static lookup
 * tables.)
 *
 * Returns huffman_codes_t* on success; free using huffman_codes_destroy().
 *         NULL on failure.
 */
huffman_codeset_t *generate_code_set(int code_set, int ncodes,
				     unsigned char *data, int len,
				     int eof, int max_code_len,
				     int all_codes) {
    huffman_codeset_t *cs;

    /*
     * Either INLINE or a CODE_USER+ set of codes.
     * => analyse the data and compute a new set of bit-lengths & codes.
     */
    if (code_set >= 128 || code_set == CODE_INLINE) { 
	int i;

	if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs))))
	    return NULL;

	cs->code_set = code_set;
	cs->ncodes = ncodes;
	cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes));
	cs->blk = NULL;
	cs->bit_num = 0;
	cs->decode_t = NULL;
	cs->decode_J4 = NULL;

	for (i = 0; i < ncodes; i++) {
	    /*
	     * If requested, include EOF all code sets, but at a
	     * frequency of only '1' occurrance where we predict it
	     * not to be needed.
	     */
	    if (eof && (len+i)%ncodes)
		eof = 1;
	    cs->codes[i] = calc_bit_lengths(data, len, eof, max_code_len,
					    all_codes, i, ncodes);
	    cs->codes[i]->codes_static = 0;
	    if (NULL == cs->codes[i]) {
		/* FIXME: tidy up */
		return NULL;
	    }

	    canonical_codes(cs->codes[i]);
	}

    /*
     * Otherwise we use the determined codes at the top of this file, such
     * as codes_dna and codes_english.
     */
    } else {
	if (code_set < 1 || code_set >= NCODES_STATIC) {
	    fprintf(stderr, "Unknown huffman code set '%d'\n", code_set);
	    return NULL;
	}

	/* If our global codeset hasn't been initialised yet, do so */
	if (!static_codeset[code_set]) {
	    huffman_codes_t *c = (huffman_codes_t *)malloc(sizeof(*c));

	    if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs))))
		return NULL;

	    cs->codes = (huffman_codes_t **)malloc(sizeof(*cs->codes));
	    cs->codes[0] = c;
	    cs->ncodes = 1;
	    cs->code_set = code_set;
	    cs->blk = NULL;
	    cs->bit_num = 0;
	    cs->decode_t = NULL;
	    cs->decode_J4 = NULL;

	    c->codes_static = 1;
	    c->max_code_len = MAX_CODE_LEN;

	    switch(code_set) {
	    case CODE_DNA:
		c->codes = codes_dna;
		c->ncodes = sizeof(codes_dna)/sizeof(*c->codes);
		cs->bit_num = 5;
		break;

	    case CODE_DNA_AMBIG:
		c->codes = codes_dna_ambig;
		c->ncodes = sizeof(codes_dna_ambig)/sizeof(*c->codes);
		cs->bit_num = 1;
		break;

	    case CODE_ENGLISH:
		c->codes = codes_english;
		c->ncodes = sizeof(codes_english)/sizeof(*c->codes);
		cs->bit_num = 1;
		break;

	    default:
		fprintf(stderr, "Unknown huffman code set '%d'\n", code_set);
		return NULL;
	    }

	    canonical_codes(c);

	    static_codeset[code_set] = cs;
	}

	cs = static_codeset[code_set];
    }

    return cs;
}

void huffman_codes_destroy(huffman_codes_t *c) {
    if (!c)
	return;

    if (!c->codes_static && c->codes)
	free(c->codes);

    free(c);
}

void huffman_codeset_destroy(huffman_codeset_t *cs) {
    int i;

    if (!cs)
	return;

    /* If this codeset is one of the predefined global ones we do nothing */
    if (cs->ncodes == 1 && cs->codes[0]->codes_static)
	return;

    for (i = 0; i < cs->ncodes; i++)
	huffman_codes_destroy(cs->codes[i]);
    if (cs->codes)
	free(cs->codes);
    if (cs->blk)
	block_destroy(cs->blk, 0);
    if (cs->decode_t)
	free(cs->decode_t);
    if (cs->decode_J4)
	free(cs->decode_J4);

    free(cs);
}

/*
 * ---------------------------------------------------------------------------
 * Encoding and decoding related functions
 */

/*
 * Can store up to 24-bits worth of data encoded in an integer value
 * Possibly we'd want to have a less optimal store_bits function when dealing
 * with nbits > 24, but for now we assume the codes generated are never
 * that big. (Given this is only possible with 121392 or more
 * characters with exactly the correct frequency distribution we check
 * for it elsewhere.)
 */
static void store_bits(block_t *block, unsigned int val, int nbits) {
    /* fprintf(stderr, " store_bits: %02x %d\n", val, nbits); */

#if 1
    {
	unsigned int curr = block->data[block->byte];
	curr |= (val & ((1 << nbits)-1)) << block->bit;
	block->bit += nbits;
	while (block->bit >= 8) {
	    block->data[block->byte++] = curr & 0xff;
	    curr >>= 8;
	    block->bit -= 8;
	}
	block->data[block->byte] = curr & 0xff;
    }
    return;

#else

    {
      /* Slow, but simple */
      unsigned int mask = 1;
      int bit = 1 << block->bit;
      do {
	if (val & mask)
	    block->data[block->byte] |= bit;
	/*
	 * Data should be zeroed anyway, so this is not needed.
	 *
	 * else
	 *    block->data[block->byte] &= ~bit;
	*/

	if (++block->bit == 8) {
	    block->bit = 0;
	    block->byte++;
	    block->data[block->byte] = 0;
	    bit = 1;
	} else {
	    bit <<= 1;
	}
	mask <<= 1;
      } while(--nbits);
    }

#endif
}

/*
 * Reads up to 24-bits worth of data and returns. Updates the block
 * byte and bit values to indicate the current 'read' position.
 *
 * Returns unsigned value on success (>=0)
 *         -1 on failure
 */
static signed int get_bits(block_t *block, int nbits) {
    unsigned int val, bnum = 0;

    if (block->byte*8 + block->bit + nbits > block->alloc * 8)
	return -1;

    /* Fetch the partial byte of data */
    val = (block->data[block->byte]) >> block->bit;
    bnum = 8 - block->bit;

    /* And additional entire bytes worth as required */
    while (bnum <= nbits) {
	val |= block->data[++block->byte] << bnum;
	bnum += 8;
    }

    block->bit = (block->bit + nbits) % 8;
    return val & ((1 << nbits) - 1);
}

/* stores nbytes bytes, padding to align on the next byte boundary */
void store_bytes(block_t *block, unsigned char *val, int nbytes) {
    /* Align */
    if (block->bit != 0) {
	block->byte++;
	block->bit = 0;
    }

    /* Resize */
    block_resize(block, block->byte + nbytes + 1);

    /* Store */
    memcpy(&block->data[block->byte], val, nbytes);
    block->byte += nbytes;
}


/*
 * Encodes the huffman symbol bit-lengths as a serialised block of data
 * suitable for storing in a ZTR "ZLBH" chunk. This uses the Deflate
 * storage format defined in RFC1951.
 *
 * Returns: 0 on success
 *         -1 on failure
 */
int store_codes_single(block_t *out, huffman_codes_t *c) {
    int i;
    unsigned char bl_code[257]; /* bit-length codes and for codes 16-18 */
    unsigned char bl_opt[257];  /*     the operand to the blcode */
    unsigned char sorted_codes[258];
    int bl_freq[19]; /* frequency of bit-length codes produced */
    int bl_count;
    huffman_codes_t *bl_cds = NULL;
    int hclen_order[] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
    };
    int hlit, hdist, hclen, hcmap[19];

    /* output_code_set (stderr, c); */

    if (out->alloc < out->byte + 1000) {
	out->alloc = out->byte + 1000;
	if (NULL == (out->data = realloc(out->data, out->alloc)))
	    return -1;
    }

    /*
     *-----------------------------------------------------------------
     * Reformat the dynamic code bit-lengths into an alphabet of 19 
     * "code length" symbols as defined in RFC1951.
     */
    memset(sorted_codes, 0, 258);
    for (i = 0; i < c->ncodes; i++) {
	sorted_codes[c->codes[i].symbol] = c->codes[i].nbits;
    }
    for (i = 0; i < 19; i++)
	bl_freq[i] = 0;

    bl_count = 0;
    for (i = 0; i < 257; ) {
	int j = i+1, n;
	int v = sorted_codes[i];
	while (j < 257 && sorted_codes[j] == v)
	    j++;

	n = j-i; /* n = run-length */
	/* fprintf(stderr, "value=%d, run_len=%d\n", v, n); */
	if (v == 0) {
	    /* bit-len zero => no code and uses code 17/18 for run len */
	    while (n > 0) {
		while (n >= 11) {
		    bl_freq[18]++;
		    bl_code[bl_count] = 18;
		    bl_opt[bl_count] = MIN(n, 138)-11;
		    n -= bl_opt[bl_count]+11;
		    bl_count++;
		}

		while (n >= 3) {
		    bl_freq[17]++;
		    bl_code[bl_count] = 17;
		    bl_opt[bl_count] = MIN(n, 10)-3;
		    n -= bl_opt[bl_count]+3;
		    bl_count++;
		}

		switch (n) {
		case 2: bl_code[bl_count++] = 0; bl_freq[0]++; n--;
		case 1: bl_code[bl_count++] = 0; bl_freq[0]++; n--;
		}
	    }
	} else if (v <= 15) {
	    /* non-zero code, uses code 16 for run-len */
	    if (n >= 4) {
		bl_freq[v]++;
		bl_code[bl_count++] = v;
		n--;
		while (n >= 3) {
		    bl_freq[16]++;
		    bl_code[bl_count] = 16;
		    bl_opt[bl_count] = MIN(n, 6)-3;
		    n -= bl_opt[bl_count]+3;
		    bl_count++;
		}
	    }

	    switch(n) {
	    case 3: bl_code[bl_count++] = v; bl_freq[v]++; n--;
	    case 2: bl_code[bl_count++] = v; bl_freq[v]++; n--;
	    case 1: bl_code[bl_count++] = v; bl_freq[v]++; n--;
	    }
	} else {
	    fprintf(stderr, "Unsupported code length: %d\n", v);
	}

	i = j;
    }
    hlit = 257;

    /* Add a single distance code of zero bits. This means that there
     * are no distance codes used at all.
     */
    bl_code[bl_count++] = 0;
    bl_freq[0]++;
    hdist = 1;

    /* Produce new huffman codes for our code-length symbols. */
    bl_cds = calc_bit_lengths(bl_code, bl_count, 0, 7, 0, 0, 1);
    /* output_code_set (stderr, bl_cds); */

    /*
     *-----------------------------------------------------------------
     * Output the "code length" bit-lengths, 3 bits at a time.
     *
     * Compute how many HCLEN code length values we need, using the
     * predefined order in the RFC.
     */
    for (hclen = 19; hclen > 0; hclen--) {
	if (bl_freq[hclen_order[hclen-1]])
	    break;
    }

    store_bits(out, hlit-257,  5);
    store_bits(out, hdist-1, 5);
    store_bits(out, hclen-4, 4);

    for (i = 0; i < 19; i++)
	hcmap[i] = -1;
    for (i = 0; i < bl_cds->ncodes; i++)
	hcmap[bl_cds->codes[i].symbol] = i;
    for (i = 0; i < hclen; i++) {
	if (hcmap[hclen_order[i]] >= 0) {
	    store_bits(out, bl_cds->codes[hcmap[hclen_order[i]]].nbits, 3);
	} else {
	    store_bits(out, 0, 3);
	}
    }


    /*
     *----------------------------------------------------------------
     * Finally output the original bit-lengths using the code-len codes.
     */
    for (i = 0; i < bl_count; i++) {
	huffman_code_t *c = &bl_cds->codes[hcmap[bl_code[i]]];
	store_bits(out, c->code, c->nbits); 
	/*
	fprintf(stderr, "bl_code %d (opt %d), code %d/%d\n",
		bl_code[i], bl_opt[i], c->code, c->nbits);
	*/
	switch(bl_code[i]) {
	case 18:
	    store_bits(out, bl_opt[i], 7);
	    break;
	case 17:
	    store_bits(out, bl_opt[i], 3);
	    break;
	case 16:
	    store_bits(out, bl_opt[i], 2);
	    break;
	}
    }

    huffman_codes_destroy(bl_cds);

    return 0;
}

/*
 * A wrapper around store_codes_single to output either a single or multiple
 * huffman codes to a block.
 *
 * This also creates a new block and fills out the block header appropriately.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int store_codes(block_t *out, huffman_codeset_t *cs, int last_block) {
    int i;

    if (out->alloc < out->byte + 1000) {
	out->alloc = out->byte + 1000;
	if (NULL == (out->data = realloc(out->data, out->alloc)))
	    return -1;
    }

    /* Header details */
    store_bits(out, last_block != 0,  1); /* last block */
    if (cs->ncodes == 1) {
	store_bits(out, 2,  2); /* dynamic huffman */
    } else {
	int nbits = 0;
	store_bits(out, 3,  2); /* multiple tree dynamic huffman */
	while (1<<nbits <= cs->ncodes-1)
	    nbits++;
	store_bits(out, nbits-1, 4);
	store_bits(out, cs->ncodes-1, nbits);
    }

    for (i = 0; i < cs->ncodes; i++) {
	if (-1 == store_codes_single(out, cs->codes[i]))
	    return -1;
    }

    return 0;
}

/*
 * This is the opposite of the store_codes() function. It loads generates
 * huffman_codes_t structs from the a serialised data stream as presented
 * in the above format.
 *
 * The input data is the data-string. On return the number of bytes
 * consumed will be returned in *len_used (if non NULL).
 * This is to allow stripping off of the huffman codes from a longer
 * array of data (ie probably followed by the STHUFF encoded chunk
 * itself).
 *
 * Returns: malloced huffman_codes_t structure on success.
 *          NULL on failure.
 */
huffman_codes_t *restore_codes_single(block_t *block) {
    int hlit, hdist, hclen;
    int hclen_order[19] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
    };
    int hc_bitlen[19], i;
    huffman_codes_t *bl_cds, *cds;
    int sym, sym_val, last_len;
    int htab[256];

    hlit   = get_bits(block, 5)+257;
    hdist  = get_bits(block, 5)+1;
    hclen  = get_bits(block, 4)+4;

    /*
    fprintf(stderr, "bfinal = %d, btype=%d\n", *bfinal, btype);
    fprintf(stderr, "hlit=0x%x, hdist=0x%x, hclen=0x%x\n",
	    hlit, hdist, hclen);
    */

    /* Read HCLEN code-lengths and construct huffman codes from them */
    for (i = 0; i < hclen; i++)
	hc_bitlen[hclen_order[i]] = get_bits(block, 3);
    for (; i < 19; i++)
	hc_bitlen[hclen_order[i]] = 0;

    bl_cds = (huffman_codes_t *)malloc(sizeof(*bl_cds));
    bl_cds->codes_static = 0;
    bl_cds->ncodes = 0;
    bl_cds->codes = (huffman_code_t *)malloc(19 * sizeof(*bl_cds->codes));
    bl_cds->max_code_len = 7;
    for (i = 0; i < 19; i++) {
	if (hc_bitlen[i]) {
	    bl_cds->codes[bl_cds->ncodes].symbol = i;
	    bl_cds->codes[bl_cds->ncodes].nbits = hc_bitlen[i];
	    bl_cds->ncodes++;
	}
    }

    canonical_codes(bl_cds);
    /* output_code_set (stderr, bl_cds); */

    /* Build a lookup table of possible codes to symbols */
    for (i = 0; i < 256; i++)
	htab[i] = -1;
    for (i = 0; i < bl_cds->ncodes; i++) {
	htab[bit_reverse(bl_cds->codes[i].code, bl_cds->codes[i].nbits)
	     | (1<<bl_cds->codes[i].nbits)]
	    = bl_cds->codes[i].symbol;
    }

    /* Now decode the next HLIT literal codes using bl_cds */
    cds = (huffman_codes_t *)malloc(sizeof(*cds));
    cds->codes_static = 0;
    cds->ncodes = 0;
    cds->codes = (huffman_code_t *)malloc(257 * sizeof(*cds->codes));
    cds->max_code_len = 15;
    sym_val = last_len = 0;
    while ((sym = next_symbol(block, htab)) != -1) {
	int count;
	/* fprintf(stderr, "LIT Sym=%d\n", sym); */

	switch(sym) {
	case 16:
	    count = get_bits(block, 2) + 3;
	    /* fprintf(stderr, "   +%d\n", count); */
	    for (i = 0; i < count; i++) {
		cds->codes[cds->ncodes].symbol = sym_val++;
		cds->codes[cds->ncodes++].nbits = last_len;
	    }
	    break;
	    
	case 17:
	    count = get_bits(block, 3) + 3;
	    /* fprintf(stderr, "   +%d\n", count); */
	    sym_val += count;
	    last_len = 0;
	    break;
	    
	case 18:
	    count = get_bits(block, 7) + 11;
	    /* fprintf(stderr, "   +%d\n", count); */
	    sym_val += count;
	    last_len = 0;
	    break;

	case 0:
	    sym_val++;
	    last_len = 0;
	    break;

	default:
	    cds->codes[cds->ncodes].symbol = sym_val++;
	    last_len = cds->codes[cds->ncodes++].nbits = sym;
	}

	if (sym_val >= hlit)
	    break;
    }
    assert(sym != -1);
    assert(cds->ncodes <= 257);

    /* Skip HDIST codes. Hopefully only 1 of zero length */
    sym_val = 0;
    while ((sym = next_symbol(block, htab)) != -1) {
	/* fprintf(stderr, "DIST Sym=%d\n", sym); */

	switch(sym) {
	case 16:
	    sym_val += get_bits(block, 2) + 3;
	    break;
	    
	case 17:
	    sym_val += get_bits(block, 3) + 3;
	    break;
	    
	case 18:
	    sym_val += get_bits(block, 7) + 11;
	    break;

	default:
	    sym_val++;
	}

	if (sym_val >= hdist)
	    break;
    }
    assert(sym != -1);

    huffman_codes_destroy(bl_cds);
    canonical_codes(cds);
    /* output_code_set(stderr, cds); */

    return cds;
}

huffman_codeset_t *restore_codes(block_t *block, int *bfinal) {
    int btype;
    huffman_codeset_t *cs;
    
    /* Header details */
    if (bfinal)
	*bfinal = get_bits(block, 1);
    else
	get_bits(block, 1);
    btype  = get_bits(block, 2);

    cs = (huffman_codeset_t *)malloc(sizeof(*cs));
    cs->code_set = 0;
    cs->blk = NULL;
    cs->bit_num = 0;
    cs->decode_t = NULL;
    cs->decode_J4 = NULL;

    if (btype == 2) {
	/* Standard Deflate algorithm */
	cs->ncodes = 1;
	cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes));
	cs->codes[0] = restore_codes_single(block);
    } else if (btype == 3) {
	/* Deflate extension - multiple codes */
	int nbits, i;
	nbits  = get_bits(block, 4) + 1;
	cs->ncodes = get_bits(block, nbits) + 1;
	cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes));
	for (i = 0; i < cs->ncodes; i++) {
	    cs->codes[i] = restore_codes_single(block);
	}
    } else {
	fprintf(stderr, "restore_codes() only implemented for "
		"BTYPE == DYNAMIC HUFFMAN and INTERLACED HUFFMAN\n");
	return NULL;
    }

    cs->bit_num = block->bit;

    return cs;
}

/*
 * Given a multiple sets of huffman codes and a block of data this
 * compresses and returns the data block. It iterates around each set
 * of huffman codes in a cyclic fashion encoding each byte with the
 * appropriate huffman codes.
 *
 * Returns: 0 on success
 *          -1 on failure
 */
int huffman_multi_encode(block_t *blk, huffman_codeset_t *cs,
			 int code_set, unsigned char *data, int len) {
    int i, nc;
    huffman_code_t *lookup;
    huffman_codes_t **c;

    if (!cs) {
	/* No codes known, so derive our own */
	fprintf(stderr, "FIXME: use generate_code_set() to build our own codes here\n");
	return -1;
    } else {
	c  = cs->codes;
	nc = cs->ncodes;
    }

    /*
     * The maximum size to encode len bytes is < 9 bits per symbol
     * (not quite 8 due to an EOF symbol) plus the overhead of the bit-length
     * tree. That in turn, with alternating 8/9 bit-lengths would max out
     * as 258*8 + 5+5+4 + 19*3 + 258*5 bits (429 bytes), but in practice
     * I'm not even sure if it's possible to construct such a set of code
     * lengths that would compress that poor.
     *
     * This of course assumes we're using appropriate compression codes.
     * Given a user may give a completely inappropriate code we have to
     * assume every symbol is actually 15 bits instead of < 9 on average.
     *
     * We ensure blk here is large enough for the worst case scenario so we
     * don't incur overheads in store_bits().
     */
    if (blk->alloc <= 429 + 2*len + 2 + blk->byte) {
	blk->alloc  = 429 + 2*len + 2 + blk->byte;
	blk->data = realloc(blk->data, blk->alloc);
	if (!blk->data)
	    return -1;
    }

    /*
     * Splitting this special case out is worth it as it's approx 30% faster.
     * Also note that the nc > 1 case is faster with a separate counter and
     * test than using modulus (by a factor of 2). It could be sped up further
     * for powers of 2 using bitwise AND, but the difference is not huge.
     */
    if (nc == 1) {
	lookup = c[0]->lookup;
	for (i = 0; i < len; i++) {
	    assert(lookup[data[i]].nbits > 0);
	    store_bits(blk, lookup[data[i]].code,
		       lookup[data[i]].nbits);
	}
    } else {
	int count = 0;
	for (i = 0; i < len; i++) {
	    lookup = c[count]->lookup;
	    assert(lookup[data[i]].nbits > 0);
	    store_bits(blk, lookup[data[i]].code,
		       lookup[data[i]].nbits);
	    if (++count == nc)
		count = 0;
	}
    }

    lookup = c[i%nc]->lookup;
    store_bits(blk, lookup[SYM_EOF].code, lookup[SYM_EOF].nbits);

    assert(blk->alloc > blk->byte);

    /* We probably massively overallocated, so return some of it back */
    blk->data = realloc(blk->data, blk->byte+1);
    blk->alloc = blk->byte+1;

    return 0;
}

/*
 * The opposite of huffman_encode().
 * Decode a huffman stream from 'block' using huffman codes 'c'.
 *
 * Returns: allocated block_t pointer on success
 *          NULL on failure.
 *
 * Method 1
 * --------
 *
 * At any node in our tree we can precompute a lookup table so that upon
 * reading the next 'k' bits we know the new node we'd end up in and what
 * symbols to export.
 * Then decoding simply works in fixed sets of k bits at a time.
 *
 * We use k=4 for efficient table space (they fit neatly in cache) and ease
 * of decoding 4-bits at a time. k=8 is about 20% faster as reading the input
 * byte by byte is easy, but the setup time is substantially longer
 * (16x at a guess) and the lookup tables no longer fit in the L1 cache.
 */
block_t *huffman_decode(block_t *in, huffman_codes_t *c) {
    block_t *out;
    htree_t t[513]; /* number of internal nodes */
    int i, j, n;
    int new_node, node_num;
    h_jump4_t J4[513][16];
    unsigned char *cp;

    if (NULL == (out = block_create(NULL, 8*in->alloc+8))) {
	block_destroy(in, 0);
	return NULL;
    }

    /* Construct the tree from the codes */
    new_node = 1;
    t[0].l[0] = t[0].l[1] = -1;
    t[0].c[0] = t[0].c[1] = 0;
    for (i = 0; i < c->ncodes; i++) {
	int n = 0;
	unsigned int v = c->codes[i].code;

	for (j = 0; j < c->codes[i].nbits-1; j++) {
	    int b = v & 1;
	    if (t[n].c[b]) {
		n = t[n].c[b];
	    } else {
		n = (t[n].c[b] = new_node++);
		t[n].c[0] = t[n].c[1] = 0;
		t[n].l[0] = t[n].l[1] = -1;
	    }
	    v >>= 1;
	}
	/* last bit */
	t[n].l[v & 1] = c->codes[i].symbol;
    }

    /* Build the 16 wide lookup table per node */
    for (n = 0; n < new_node; n++) {
	for (j = 0; j < 16; j++) {
	    unsigned int v = j;
	    int n2 = n;
	    h_jump4_t *hj = &J4[n][j];
	    hj->nsymbols = 0;
	    hj->top_bit = 0;
	    for (i = 0; i < 4; i++) {
		int b = v & 1;
		if (t[n2].l[b] >= 0) {
		    hj->symbol[hj->nsymbols++] = t[n2].l[b];
		    if (t[n2].l[b] == SYM_EOF)
			if (!hj->top_bit)
			    hj->top_bit |= 1 << (hj->nsymbols-1);
		}
		n2 = t[n2].c[b];
		v >>= 1;
	    }
	    hj->jump = n2;
	}
    }

#if 0
    /* Debug output */
    for (n = 0; n < new_node; n++) {
	printf("Node %d, c[]={%d,%d}, l[]={%d,%d}\n",
	       n, t[n].c[0], t[n].c[1], t[n].l[0], t[n].l[1]);
	for (i = 0; i < 256; i++) {
	    printf("\t%02x %s =>%02d, ", i, print_8rev(i), J4[n][i].jump);
	    for (k = 0; k < J4[n][i].nsymbols; k++) {
		if (isprint(J4[n][i].symbol[k]))
		    printf(" '%c'", J4[n][i].symbol[k]);
		else
		    printf(" %03d", J4[n][i].symbol[k]);
	    }
	    printf("\n");
	}
    }
#endif


    /*
     * Decoding - part 1
     * We're part way through a byte, so decode bit by bit up to the next
     * whole byte and then we start the fast decoding section.
     */
    cp = out->data;
    node_num = 0;
    while (in->bit != 0) {
	int b = get_bits(in, 1);
	if (t[node_num].l[b] != -1) {
	    if (t[node_num].l[b] != SYM_EOF) {
		*cp++ = t[node_num].l[b];
	    } else {
		out->byte = cp - out->data;
		return out;
	    }
	}
	node_num = t[node_num].c[b];
    }

    /*
     * Decoding - part 2
     *
     * We handle data nibble by nibble using the nibble to get an
     * h_jump4_t lookup from the J4[] table.
     * If top_bit is clear then we know we have no funny business (SYM_EOF)
     * so we use a fast decoding technique, otherwise we have to do a slower
     * loop with a check.
     */
    {
	int last_node = 0;
	unsigned char *last_cp = cp;
	h_jump4_t *x = &J4[node_num][in->data[in->byte] & 0x0f];
	int l = x->nsymbols;
	int b;

	/*
	 * This is the tight loop, so we over-optimise here by ignoring EOF
	 * and relying on knowing the length of the input data stream.
	 * This allows us to ignore the 9-bit data and only operate on
	 * the basic 0-255 symbols, glossing over the minor issue that EOF
	 * will look like an ordinary symbol.
	 */
	for (i = in->byte; i < in->alloc; i++) {
	    last_cp = cp;
	    last_node = node_num;

	    x = &J4[node_num][in->data[i] & 0x0f];
	    l = x->nsymbols;

	    for (j = 0; j < l; j++) {	
		*cp++ = x->symbol[j];
	    }
	    node_num = x->jump;

	    if (x->top_bit)
		break;

	    x = &J4[node_num][(in->data[i] >> 4) & 0x0f];
	    l = x->nsymbols;
	     
	    for (j = 0; j < l; j++) {	
		*cp++ = x->symbol[j];
	    }
	    node_num = x->jump;

	    if (x->top_bit)
		break;
	}

    
	/*
	 * Decoding - part 3
	 *
	 * The above optimisation has unfortunately added EOF to our data
	 * along with whatever else was packed in the last byte after the
	 * EOF symbol. So we rewind one byte and finish off decoding
	 * the slow way - walking the tree.
	 */
	cp = last_cp;
	node_num = last_node;
	in->byte = i;
	in->bit = 0;
	while (-1 != (b = get_bits(in, 1))) {
	    if (t[node_num].l[b] != -1) {
		if (t[node_num].l[b] != SYM_EOF) {
		    *cp++ = t[node_num].l[b];
		} else {
		    out->byte = cp - out->data;
		    return out;
		}
	    }
	    node_num = t[node_num].c[b];
	}
     }


    /* We shouldn't reach here */
    return NULL;
}

int init_decode_tables(huffman_codeset_t *cs) {
    int nnodes, i, j, n, nc;
    huffman_codes_t **c;
    int new_node, rec;
    h_jump4_t (*J4)[16] = NULL;
    htree_t *t;
    
    c = cs->codes;
    nc = cs->ncodes;

    /* Allocate memory for internal nodes (nsyms-1 for each code set) */
    for (nnodes = i = 0; i < nc; i++) {
	nnodes += c[i]->ncodes-1;
    }

    if (NULL == (t = (htree_t *)malloc(nnodes * sizeof(*t))))
	goto error;

    if (NULL == (J4 = (h_jump4_t (*)[16])malloc(nnodes * sizeof(*J4))))
	goto error;

    /*
     * Construct the tree from the codes.
     * We have one tree for all 'nc' huffman codes with each tree pointing
     * to the root of the next one (or first) tree whenever we emit a
     * symbol.
     *
     * This then effectively means the decoding step is identical to the
     * single huffman code function.
     */
    new_node = 0;
    for (rec = 0; rec < nc; rec++) {
	int root = new_node++;
	int next_root = rec == nc-1
	    ? 0
	    : root + c[rec]->ncodes-1;
	    
	t[root].l[0] = t[root].l[1] = -1;
	t[root].c[0] = t[root].c[1] = next_root;
	for (i = 0; i < c[rec]->ncodes; i++) {
	    int n = root;
	    unsigned int v = c[rec]->codes[i].code;

	    for (j = 0; j < c[rec]->codes[i].nbits-1; j++) {
		int b = v & 1;
		if (t[n].c[b] != next_root) {
		    n = t[n].c[b];
		} else {
		    n = (t[n].c[b] = new_node++);
		    t[n].c[0] = t[n].c[1] = next_root;
		    t[n].l[0] = t[n].l[1] = -1;
		}
		v >>= 1;
	    }
	    /* last bit */
	    t[n].l[v & 1] = c[rec]->codes[i].symbol;
	}
    }

    /*
    for (i = 0; i < new_node; i++) {
	printf("t[%d] = {(%d,%d), (%d,%d)}\n",
	       i,
	       t[i].l[0], t[i].l[1],
	       t[i].c[0], t[i].c[1]);
    }
    */

    /* Build the 16 wide lookup table per node */
    for (n = 0; n < new_node; n++) {
	for (j = 0; j < 16; j++) {
	    unsigned int v = j;
	    int n2 = n;
	    h_jump4_t *hj = &J4[n][j];
	    hj->nsymbols = 0;
	    hj->top_bit = 0;
	    for (i = 0; i < 4; i++) {
		int b = v & 1;
		if (t[n2].l[b] >= 0) {
		    hj->symbol[hj->nsymbols++] = t[n2].l[b];
		    if (t[n2].l[b] == SYM_EOF)
			if (!hj->top_bit)
			    hj->top_bit |= 1 << (hj->nsymbols-1);
		}
		n2 = t[n2].c[b];
		v >>= 1;
	    }
	    hj->jump = n2;
	    /*
	    printf("J4[%d][%d] = {'%.*s', %d}\n",
		   n, j, hj->nsymbols, hj->symbol, n2);
	    */
	}
    }

    cs->decode_t = t;
    cs->decode_J4 = J4;

    return 0;

 error:
    if (t)
	free(t);

    if (J4)
	free(J4);

    cs->decode_t = NULL;
    cs->decode_J4 = NULL;

    return -1;
}

/*
 * The opposite of huffman_encode().
 * Decode a huffman stream from 'block' using huffman codes 'c'.
 *
 * Returns: allocated block_t pointer on success
 *          NULL on failure.
 *
 * Method 1
 * --------
 *
 * At any node in our tree we can precompute a lookup table so that upon
 * reading the next 'k' bits we know the new node we'd end up in and what
 * symbols to export.
 * Then decoding simply works in fixed sets of k bits at a time.
 *
 * We use k=4 for efficient table space (they fit neatly in cache) and ease
 * of decoding 4-bits at a time. k=8 is about 20% faster as reading the input
 * byte by byte is easy, but the setup time is substantially longer
 * (16x at a guess) and the lookup tables no longer fit in the L1 cache.
 *
 * NB: This version also handles multiple interleaved huffman codes as
 * this support doesn't really slow down the decoding process.
 */
block_t *huffman_multi_decode(block_t *in, huffman_codeset_t *cs) {
    block_t *out = NULL;
    int i, j;
    int node_num;
    unsigned char *cp;
    h_jump4_t (*J4)[16];
    htree_t *t;

    if (!cs)
	return NULL;

    /* Ensure precomputed lookup tables exist */
    if (!cs->decode_t || !cs->decode_J4)
	if (-1 == init_decode_tables(cs))
	    return NULL;

    t  = cs->decode_t;
    J4 = cs->decode_J4;

    if (NULL == (out = block_create(NULL, 9*(in->alloc+1)))) {
	goto error;
    }

    /*
     * Decoding - part 1
     * We're part way through a byte, so decode bit by bit up to the next
     * whole byte and then we start the fast decoding section.
     */
    cp = out->data;
    node_num = 0;
    while (in->bit != 0) {
	int b = get_bits(in, 1);
	htree_t *t2 = &t[node_num];
	if (t2->l[b] != -1) {
	    if (t2->l[b] != SYM_EOF) {
		*cp++ = t2->l[b];
	    } else {
		out->byte = cp - out->data;
		goto success;
	    }
	}
	node_num = t2->c[b];
    }

    /*
     * Decoding - part 2
     *
     * We now handle data nibble by nibble using the nibble to get an
     * h_jump4_t lookup from the J4[] table.
     * If top_bit is clear then we know we have no funny business (SYM_EOF)
     * so we use a fast decoding technique, otherwise we have to do a slower
     * loop with a check.
     */
    {
	int last_node = node_num;
	unsigned char *last_cp = cp;
	h_jump4_t *x = &J4[node_num][in->data[in->byte] & 0x0f];
	int l = x->nsymbols;
	int b;

	/*
	 * This is the tight loop, so we over-optimise here by ignoring EOF
	 * and relying on knowing the length of the input data stream.
	 * This allows us to ignore the 9-bit data and only operate on
	 * the basic 0-255 symbols, glossing over the minor issue that EOF
	 * will look like an ordinary symbol.
	 */
	for (i = in->byte; i < in->alloc; i++) {
	    last_cp = cp;
	    last_node = node_num;

	    x = &J4[node_num][in->data[i] & 0x0f];
	    l = x->nsymbols;

	    /* printf("val=%d\n", in->data[i] & 0x0f); */
	    for (j = 0; j < l; j++) {
		*cp++ = x->symbol[j];
	    }
	    node_num = x->jump;

	    if (x->top_bit)
		break;

	    x = &J4[node_num][(in->data[i] >> 4) & 0x0f];
	    l = x->nsymbols;
	     
	    for (j = 0; j < l; j++) {	
		*cp++ = x->symbol[j];
	    }
	    node_num = x->jump;

	    if (x->top_bit)
		break;
	}

    
	/*
	 * Decoding - part 3
	 *
	 * The above optimisation has unfortunately added EOF to our data
	 * along with whatever else was packed in the last byte after the
	 * EOF symbol. So we rewind one byte and finish off decoding
	 * the slow way - walking the tree.
	 */
	cp = last_cp;
	node_num = last_node;
	in->byte = i;
	in->bit = 0;
	while (-1 != (b = get_bits(in, 1))) {
	    htree_t *t2 = &t[node_num];
	    if (t2->l[b] != -1) {
		if (t2->l[b] != SYM_EOF) {
		    *cp++ = t2->l[b];
		} else {
		    out->byte = cp - out->data;
		    goto success;
		}
	    }
	    node_num = t2->c[b];
	}
     }

 success:
    return out;

 error:
    if (out)
	block_destroy(out, 0);

    return NULL;
}

#if 0
/* A simple to understand (but slow) version of the above function */
block_t *huffman_multi_decode(block_t *in, huffman_codes_t **c, int nc) {
    block_t *out;
    htree_t (*t)[513]; /* number of internal nodes */
    int i, j, n, rec;
    int new_node, node_num;
    unsigned char *cp;
    int bC; /* byte count */

    if (NULL == (t = (htree_t (*)[513])malloc(nc * sizeof(*t))))
	return NULL;

    if (NULL == (out = block_create(NULL, 8*in->alloc+8))) {
	block_destroy(in, 0);
	free(t);
	return NULL;
    }

    /* Construct the tree from the codes */
    for (rec = 0; rec < nc; rec++) {
	new_node = 1;
	t[rec][0].l[0] = t[rec][0].l[1] = -1;
	t[rec][0].c[0] = t[rec][0].c[1] = 0;
	for (i = 0; i < c[rec]->ncodes; i++) {
	    int n = 0;
	    unsigned int v = c[rec]->codes[i].code;

	    for (j = 0; j < c[rec]->codes[i].nbits-1; j++) {
		int b = v & 1;
		if (t[rec][n].c[b]) {
		    n = t[rec][n].c[b];
		} else {
		    n = (t[rec][n].c[b] = new_node++);
		    t[rec][n].c[0] = t[rec][n].c[1] = 0;
		    t[rec][n].l[0] = t[rec][n].l[1] = -1;
		}
		v >>= 1;
	    }
	    /* last bit */
	    t[rec][n].l[v & 1] = c[rec]->codes[i].symbol;
	}
    }

    /*
     * Decoding - the slow way. How to speed up multi-code decoding?
     */
    cp = out->data;
    node_num = 0;
    bC = 0;
    while (in->byte < in->alloc) {
	htree_t *t2;
	int b = get_bits(in, 1);
	t2 = &t[bC%nc][node_num];
	if (t2->l[b] != -1) {
	    if (t2->l[b] != SYM_EOF) {
		*cp++ = t2->l[b];
		bC++;
		node_num = 0;
	    } else {
		out->byte = cp - out->data;
		free(t);
		return out;
	    }
	} else {
	    node_num = t2->c[b];
	}
    }

    /* We shouldn't reach here */
    free(t);
    return NULL;
}
#endif

/*
 * A slow version of the above huffman_decode function. This is designed as
 * a piecemeal decoder for purposes of restoring the huffman codes themselves.
 * NB: this only works for code lengths small enough to keep inside the
 * htab[] dimensions - IT DOES NOT CHECK THIS.
 *
 * Returns the next symbol
 *         -1 for failure
 */
int next_symbol(block_t *in, int *htab) {
    int b, v = 0, c = 1;
    while ((b = get_bits(in, 1)) != -1) {
	v = (v<<1) | b | (c <<= 1);

	if (htab[v] != -1)
	    return htab[v];
    }

    return -1;
}


/*
 * ---------------------------------------------------------------------------
 * Debug code. This turns the library into a stand-alone program for
 * easy debugging.x
 */
static void output_code_set(FILE *fp, huffman_codes_t *cds) {
    int i, j;
    int nbits_in = 0, nbits_out = 0;
    huffman_code_t *codes = cds->codes;
    int ncodes = cds->ncodes;

    fprintf(fp, "static huffman_code_t codes_FIXME[] = {\n");
    for (i = j = 0; i < ncodes; i++) {
	nbits_out += codes[i].nbits * codes[i].freq;
	nbits_in  += 8*codes[i].freq;
	if (j == 0)
	    fprintf(fp, "    ");
	if (codes[i].symbol == SYM_EOF) {
	    fprintf(fp, "{SYM_EOF,%3d}, ", codes[i].nbits);
	    j = 10;
	} else {
	    if (isalnum(codes[i].symbol)) {
		fprintf(fp, "{'%c',%3d}, ", codes[i].symbol, codes[i].nbits);
	    } else {
		fprintf(fp, "{%3d,%3d}, ", codes[i].symbol, codes[i].nbits);
	    }
	}
	j++;
	
	if (j >= 6) {
	    fputc('\n', fp);
	    j = 0;
	}
    }
    if (j)
	fputc('\n', fp);
    fprintf(fp, "};\n");
    fprintf(fp, "/* Expected compression to %f of input */\n",
	    (double)nbits_out/nbits_in);
}

static void output_code_set2(FILE *fp, huffman_codes_t *cds) {
    int i;
    huffman_code_t *codes = cds->codes;
    int ncodes = cds->ncodes;

    fprintf(fp, "huffman_code_t = {\n");
    for (i = 0; i < ncodes; i++) {
	fprintf(fp, "\t%d:\t%3d %c %2d %04x %d\n",
		i,codes[i].symbol, codes[i].symbol,
		codes[i].nbits, codes[i].code,
		codes[i].freq);
    }
    fprintf(fp, "};\n");
}


/*
 * --------------------------------------------------------------------------
 * A test main() to create an application capable of compressing and
 * uncompressing stdin.
 */

#ifdef TEST_MAIN
#include <fcntl.h>
/* #include <zlib.h> */

/*
 * Slurps the entirety of stdin into a malloced buffer and returns a pointer
 * to it.
 *
 * Returns: malloced buffer on success, *lenp equal to length
 *          NULL on failure
 */
static unsigned char *load(int *lenp, char *fn) {
    unsigned char *data = NULL;
    int dsize = 0;
    int dcurr = 0, len;
    int fd = 0;

    if (fn) {
	if (-1 == (fd = open(fn, O_RDONLY, 0))) {
	    perror(fn);
	    return NULL;
	}
    }

    do {
	if (dsize - dcurr < 8192) {
	    dsize = dsize ? dsize * 2 : 8192;
	    if (NULL == (data = realloc(data, dsize))) {
		if (fd)
		    close(fd);
		return NULL;
	    }
	}

	len = read(fd, data + dcurr, 8192);
	if (len > 0)
	    dcurr += len;
    } while (len > 0);

    if (len == -1) {
	perror("read");
	if (fd)
	    close(fd);
	return NULL;
    }

    if (fd)
	close(fd);

    *lenp = dcurr;
    return data;
}

/*
 * Returns 0 for success
 *        -1 for failure.
 */
int decode_main(unsigned char *data, int len, int code_set) {
    huffman_codeset_t *cs = NULL;
    block_t *blk_in;
    unsigned char *out;
    int err, out_len, i;
    int bfinal;

    blk_in = block_create(NULL, 1000 + len);

    if (code_set != 0) {
	cs = generate_code_set(code_set, 1,  /* no. codes */
			       NULL, 0,      /* data + size */
			       1,	     /* eof */
			       MAX_CODE_LEN,
			       0);	     /* all_codes */
	store_codes(blk_in, cs, 1);
    }

    if (blk_in->bit != 0) {
	blk_in->data[blk_in->byte] |= data[0];
	memcpy(&blk_in->data[blk_in->byte+1], data+1, len-1);
    } else {
	memcpy(&blk_in->data[blk_in->byte], data, len);
    }

    /* Do the decoding */
    do {
	block_t *out;
	if (!cs)
	    cs = restore_codes(blk_in, &bfinal);
	out = huffman_multi_decode(blk_in, cs);
	write(1, out->data, out->byte);
	block_destroy(out, 0);
	huffman_codeset_destroy(cs);
	cs = NULL;
    } while (!bfinal);

    block_destroy(blk_in, 0);

    return 0;
}

/*
 * Returns 0 for success
 *        -1 for failure.
 */
int encode_main(unsigned char *data, int len, int code_set, int rec_size,
		int blk_size, int dump_tree, int exit_after_tree) {
    /* Encoding */
    unsigned char *d2 = data;
    block_t *blk;
    huffman_codeset_t *cs;

    blk = block_create(NULL, 8192);
    fprintf(stderr, "Input %d bytes\n", len);

    do {
	int l2 = len > blk_size ? blk_size : len;
	int i;

	if (code_set != 0)
	    l2 = len; /* predefined code-sets have final-block bit set */
	cs = generate_code_set(code_set, rec_size,
			       d2, l2, /* Data and length */
			       1,      /* eof */
			       MAX_CODE_LEN,
			       0);     /* all codes */
	if (!cs)
	    return -1;

	if (dump_tree) {
	    for (i = 0; i < rec_size; i++) {
		printf("==Sub-set %d==\n", i);
		output_code_set(stdout, cs->codes[i]);
		/* output_code_set2(stdout, cs->codes[i]); */
	    }
	    if (exit_after_tree)
		return 0;
	}

	store_codes(blk, cs, l2 == len);
	if (code_set != 0) {
	    blk->data[blk->byte = 0] = 0;  /* starting bit no. preseved */
	} else {
	    /*
	      fprintf(stderr, "codes finished at %d bytes, %d bits\n",
		    blk->byte, blk->bit);
	    */
	}
	if (exit_after_tree) {
	    write(1, blk->data, blk->byte + (blk->bit != 0));
	    return 0;
	}

	huffman_multi_encode(blk, cs, code_set, d2, l2);

	huffman_codeset_destroy(cs);
	len -= l2;
	d2  += l2;

    } while (len > 0);
    write(1, blk->data, blk->byte + (blk->bit != 0));

    fprintf(stderr, "Output %ld bytes\n", blk->byte + (blk->bit != 0));
    block_destroy(blk, 0);

    return 0;
}

int main(int argc, char **argv) {
    unsigned char *data;
    int decode = 0;
    int dump_tree = 0;
    int exit_after_tree = 0;
    int code_set = CODE_INLINE;
    int blk_size = 0x7fff;
    int rec_size = 1;
    int c, r, len;
    char *fn = NULL;

    while ((c = getopt(argc, argv, "c:detxl:b:hr:i:")) != -1) {
	switch (c) {
	case 'b':
	    blk_size = atoi(optarg);
	    break;

	case 'c':
	    code_set = atoi(optarg);
	    break;

	case 'r':
	    rec_size = atoi(optarg);
	    break;

	case 'd':
	    decode = 1;
	    break;

	case 'e':
	    decode = 0;
	    break;

	case 't':
	    dump_tree = 1;
	    break;

	case 'x':
	    exit_after_tree = 1;
	    break;
	    
	case 'i':
	    fn = optarg;
	    break;

	default:
	    fprintf(stderr, "Usage: huffman_static [options] < stdin > stdout\n");
	    fprintf(stderr, "    Decoding options\n");
	    fprintf(stderr, "        -d\tdecode flag\n");
	    fprintf(stderr, "    Encoding options\n");
	    fprintf(stderr, "        -e\tencode flag\n");
	    fprintf(stderr, "        -b size\tspecify the block-size\n");
	    fprintf(stderr, "        -c code\tspecify code-set. 0 => inline\n");
	    fprintf(stderr, "        -t\tpretty-print the code-set used\n");
	    fprintf(stderr, "        -x\texit after outputting code-set\n");
	    exit(c != 'h');
	}
    }

    data = load(&len, fn);

    r = decode
	? decode_main(data, len, code_set)
	: encode_main(data, len, code_set, rec_size,
		      blk_size, dump_tree, exit_after_tree);

    free(data);

    return r == 0 ? 0 : 1;
}
#endif