/*  File: dict.c
 *  Author: Richard Durbin and Jean Thierry-Mieg (rd@sanger.ac.uk)
 *  Copyright (C) J Thierry-Mieg and R Durbin, 1995
 * -------------------------------------------------------------------
 * Acedb is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
 * -------------------------------------------------------------------
 * This file is part of the ACEDB genome database package, written by
 * 	Richard Durbin (MRC LMB, UK) rd@mrc-lmb.cam.ac.uk, and
 *	Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.cnrs-mop.fr
 *
 * Description: 
 * Exported functions:
 * HISTORY:
	   DICT library was rewritten nov 2002 to ensure
	   that the char* returned by dictName are valid
	   untill the whole dict is destroyed.
	   i.e. they are never reallocated even if you
	   keep adding names foreever
	   and dictRemove was added
 * Last edited: Dec  4 14:50 2002 (mieg)
 * Created: Tue Jan 17 17:33:44 1995 (rd)
 *-------------------------------------------------------------------
 */

/* $Id: dict.c,v 1.19 2003/06/29 23:45:08 mieg Exp $ */
#include "regular.h"
                             /* every thing here is private */

static char *DICT_MAGIC = "dict-magic" ;
typedef struct { unsigned curr ; int pos, free, max ; char *base ; } DVOC ;
typedef struct {
  BOOL caseSensitive ;          /* default FALSE, public */
  int dim ;         /* dimension of the hash table, has to be a power of 2 */
  int max ;           /* 2 to the  dict->dim */
  int count ;         /* number of active names in the dict */
  int nVoc ;           /* current dVoc */
  int newPos ;          /* moving index in the hash table */
  char *magic ;
  Array table ;			/* hash table */
  Array keys ;		/* ofsets in the set of vocs */
  Array dVocs ;                  /* array of DVOC */
  STORE_HANDLE handle ;
} _DICT ;

#define DICT _DICT
#include "dict.h"
#undef DICT

/*
  The names are stored incrementally in large buffers, dVoc->buff, which
  are never reallocated, so that the pointers returned by dictName
  remain valid until the dict is destroyed.

  The key is as usual in acedb a composite 1 byte for the dVoc number
  3 bytes for the offsett in dVoc->buff. In reality we divide by 8
  the offset since we align the names on 64 bit boundaries.

  The table is a double hashing hash table, its dimension is a power of 2
  so it is prime with the delta hash value which is always odd. In this
  way the orbit (first hash value modulo delta covers the whole table).

  The first 0 found during the travle indicates that the name is absent
  in the hash table. The first negative value indicates a spot than has 
  been freed and can be reused, but does not stop the bouncing search .

  We retrieve a name by direct dereferencing.
  We recognize a name by hash search.
*/

#define DEBUG_MODE 
#ifndef DEBUG_MODE
/* optimised mode */
/* these 2 private complex macros are useful to accelerate the hashing code */
/* whereas the public interface always checks the range of the parameters */

#define dictKey2Name(_dict,_k) ((arrp (_dict->dVocs, ((k >> 24) & 0x000000ff) - 1, DVOC))->base + ((0x00ffffff & (_k)) << 3))
#define dictIndex2Name(_dict_i) (dictKey2Name(_dict,arr(-dict->keys,_i,KEY)))

#else
/* debugging mode */
static char *dictKey2Name (_DICT *dict, KEY k) 
{
  int nVoc = ((k >> 24) & 0x000000ff) - 1 ;
  int pos = (0x00ffffff & k) << 3 ;
  DVOC *dVoc ;
  
  if (nVoc < 0 || nVoc >= arrayMax (dict->dVocs))
    messcrash ("uDictKey2Name bad nVoc = %d", nVoc) ;
  dVoc = arrp (dict->dVocs, nVoc, DVOC) ;
  if (pos >= dVoc->curr)
    messcrash ("uDictKey2Name bad pos = %d >= curr = %d", pos, dVoc->curr) ;
  return dVoc->base + pos ;
}

static char *dictIndex2Name (_DICT *dict, int ii)
{
  KEY key ;
  
  if (ii < 0 || ii >= arrayMax (dict->keys))
    messcrash ("uDictIndex2Name") ;
  key = arr(dict->keys, ii, KEY) ;
  return dictKey2Name (dict, key) ;
}
#endif /* DEBUG_MODE */

/************* standard utility from Jean *************/
#define  SIZEOFINT (8 * sizeof (int))
static int dictHash (char *cp, int n, BOOL isDiff)
{
  register int i ;
  register unsigned int j, x = 0 ;
  register int rotate = isDiff ? 21 : 13 ;
  register int leftover =  SIZEOFINT - rotate ;

  while (*cp)
    x = freeupper (*cp++) ^ (( x >> leftover) | (x << rotate)) ; 

				/* compress down to n bits */
  for (j = x, i = n ; i < SIZEOFINT ; i += n)
    j ^= (x >> i) ;
  j &= (1 << n) - 1 ;

  if (isDiff)			/* return odd number */
    j |= 1 ;

  return j ;
}  /* dictHash */

/****************************************************************************/

static void dictReHash (_DICT *dict, int newDim)
{
  int ii ;
  KEY *kp ;

  if (newDim <= dict->dim)
    return ;
  dict->dim = newDim ;
  dict->max = 1 << newDim ;
				/* remake the table */
  arrayDestroy (dict->table) ;
  dict->table = arrayHandleCreate (dict->max, int, dict->handle) ;
  array (dict->table, dict->max-1, int) = 0 ;	/* set arrayMax */

				/* reinsert all the names */
  for (ii = 1, kp = arrp(dict->keys, ii, KEY)  ; ii < arrayMax(dict->keys) ; ii++, kp++)
    { dictFind (dict, dictKey2Name (dict, *kp), 0) ;
				/* will fail, but sets dict->newPos */
      arr(dict->table, dict->newPos, int) = ii ;
    }
} /* dictReHash */

/****************************************************************************/

static DVOC *dictAddVoc (_DICT *dict, int s)
{
  DVOC *dVoc ;
  int n1 = 0 ;
 
  dVoc = arrayp (dict->dVocs, dict->nVoc, DVOC) ;
  if (dict->nVoc > 0)
    n1 = 2 * (dVoc - 1)->max ;
  if (n1 < 8 * s)
    n1 = 8 * s ;
  dVoc->base = halloc (n1, dict->handle) ; /* never realloc */
  dVoc->curr = 0 ;
  dVoc->max = n1 ;
  dVoc->free = n1 ;
  
  dict->nVoc++ ;
  return dVoc ;
}  /* dictAddVoc */


void uDictDestroy (_DICT *dict) 
{ 
  if (dict && dict->magic == DICT_MAGIC) 
    {  /* we need 2 lines, because otherwise messfree sets (freed) dict->handle=0 */
      STORE_HANDLE handle = dict->handle ;
      messfree (handle) ;
    }
}

/****************************************************************************/

_DICT *dictHandleCreate (int size, STORE_HANDLE handle)
{
  STORE_HANDLE h = handleHandleCreate (handle) ;
  _DICT *dict = (_DICT*) halloc (sizeof (_DICT), h) ;

  dict->handle = h ;
  dict->magic = DICT_MAGIC ;
  dict->caseSensitive = FALSE ;
  dict->count = 0 ;
  for (dict->dim = 6, dict->max = 64 ; 
       dict->max < size ; 
       ++dict->dim, dict->max *= 2) ;
  dict->table = arrayHandleCreate (dict->max, int, dict->handle) ;
  array (dict->table, dict->max-1, int) = 0 ;	/* set arrayMax */
  dict->dVocs = arrayHandleCreate (8, DVOC, dict->handle) ;
  dictAddVoc (dict, 8 * dict->max) ;
  dict->keys = arrayHandleCreate (dict->dim/4, KEY, dict->handle) ;
  array (dict->keys, 0, KEY) = 0 ;		/* reserved for empty table entry */

  return dict ;
} /* dictHandleCreate */

_DICT *dictCreate (int size) 
{
  return dictHandleCreate (size, 0) ; 
} /* dictCreate */

_DICT *dictCaseSensitiveHandleCreate (int size, STORE_HANDLE handle) 
{
  _DICT *dict = dictHandleCreate (size, handle) ;
  dict->caseSensitive = TRUE ;
  
  return dict ;
} /* dictCaseSensitiveHandleCreate */

/****************************************************************************/

BOOL dictFind (_DICT *dict, char *s, int *ip)
{
  register int ii, h, dh = 0 ;
  int (*mystrcmp)() = dict->caseSensitive ?
    strcmp : strcasecmp ;

  if (!dict || !s || !*s)
    return FALSE ;

  dict->newPos = 0 ; /* will become first reusable spot */
  h = dictHash (s, dict->dim, FALSE) ;
  
  while (TRUE)
    { ii = arr (dict->table, h, KEY) ;
      if (!ii)			/* empty slot, s is unknown */
	{ 
	  if (ip) 
	    *ip = ii - 1 ;
	  if (dict->newPos == 0)
	    dict->newPos = h ;
	  return FALSE ;
	}
      else if (ii < 0)		/* freed stop */
	{ 
	  if (dict->newPos == 0)
	    dict->newPos = h ;
	  continue ;
	}
      else if (!mystrcmp (s, dictIndex2Name(dict,ii)))
	{ if (ip) 
	    *ip = ii - 1 ;
	  dict->newPos = h ;
	  return TRUE ;
	}
      if (!dh)
	dh = dictHash (s, dict->dim, TRUE) ;
      h += dh ;
      if (h >= dict->max)
	h -= dict->max ;
    }
} /* dictFind */

/****************************************************************************/

BOOL dictRemove (_DICT *dict, char *s) 
{
  int ii = 0 ;

  if (!dict || 
      !s ||
      !dictFind (dict, s, &ii))	/* word unkown */
    return FALSE ;

  ii++ ;
  arr (dict->keys, ii, KEY) = 0 ;  /* will not be rehashed */
  arr (dict->table, dict->newPos, int) = -ii ; /* will be reusable */
  dict->count-- ;
  return TRUE ;
} /* dictRemove */

/****************************************************************************/
/* always fills ip, returns TRUE if added, FALSE if known */
BOOL dictAdd (_DICT *dict, char *s, int *ip)
{
  int ii = 0, len ;
  DVOC *dVoc  ;

  if (!dict || !s)
    return FALSE ;

  if (dictFind (dict, s, &ii))	/* word already known */
    {
      if (ip)
	*ip = ii ;
      return FALSE ;
    }
  ii++ ;
  if (ii < 0)
    ii = -ii ; /* reuse */
  else
    ii = arrayMax(dict->keys) ;
  array (dict->table, dict->newPos, int) = ii ;
  
  dVoc = arrp (dict->dVocs, dict->nVoc - 1, DVOC) ;
  len = strlen (s) ;
  if (len + 1 >= dVoc->free)
    dVoc = dictAddVoc (dict, len) ;

  array (dict->keys, ii, KEY) = (0xff000000 & ((dict->nVoc)<<24)) | (0x00ffffff & (dVoc->curr)>>3) ;
  strcpy (dVoc->base + dVoc->curr, s) ;
  len++ ;            /* count the terminal zero */
  while (len%8) len++ ; /* adjust on word boundary */
  dVoc->curr += len ;
  dVoc->free -= len ;
  dict->count++ ;
  if (arrayMax(dict->keys) > 0.4 * dict->max)
    dictReHash (dict, dict->dim+1) ;

  if (ip)
    *ip = ii - 1 ;
  return TRUE ;
} /* dictAdd */

/********************** utilities ***********************/
/* dictName returns a pointer that never gets reallocated */

char *dictName (_DICT *dict, int ii)
{
  KEY key ;

  if (ii < 0 || ii + 1 >= arrayMax(dict->keys))
    {
      messcrash ("Call to dictName() out of bounds: %d not in [0,%d] ", ii, dictMax(dict) - 1) ;
      return "(Dict error, NULL NAME)" ;
    }

  key = arr (dict->keys, ii + 1, KEY) ;
  return dictKey2Name(dict, key) ;
} /* dictName */

int dictCount (_DICT *dict)
{
  return dict->count ;   /* number of active names */
}  /* dictMax */

int dictMax (_DICT *dict)   /* max to be used if looping on all entries */
{
  return arrayMax(dict->keys) - 1 ;   /* 0 == reserved pseudo key */
}  /* dictMax */

/******************** end of file **********************/
/****************************************************************************/
/****************************************************************************/