/*
   File: lxcn_vocabulary.c
   Defines the functions to manipulate a vocabulary

   Copyright 2009 Radboud University of Nijmegen
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id$"
*/

/* include config.h if autoconfigured */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* global includes */
#include <stdio.h>
#include <ctype.h>
#include <string.h>

/* Libabase includes */
#include <abase_porting.h>
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_pool_alloc.h>
#include <abase_fileutil.h>

/* Local includes */
#include "lxcn_vocabulary.h"
#include "lxcn_vocabulary_impl.h"

/*
 * This string uniquely identifies the version of vocabulary-only
 * files. It must be changed every time the on-disk format changes.
 *
 * Proposed system: if it changes in a new AGFL version, copy that
 * version. If it changes in-between, add a lowercase letter suffix,
 * to distinguish it from a pure AGFL version (e.g. "2.8a").
 *
 * Within this file it is written by lxcn_bin_save_vocabulary and
 * verified by lxcn_bin_load_vocabulary.
 */
char vocabulary_version[] = "2.8";

/*
   Node allocation
*/
/*
   Allocate a trie node from pool p and store the given tails,
   search_key, rem_offset and info in it. The search_key pointer is
   stored as is (no copy is made here). Returns the new node.
*/
static VocTrie new_trie (Pool p, VocIndexTree tails, char *search_key, int rem_offset, int info)
{
  VocTrie node;

  node = (VocTrie) abs_pool_malloc(p, sizeof(struct voc_trie_rec), "new_trie");
  node->info = info;
  node->rem_offset = rem_offset;
  node->search_key = search_key;
  node->tails = tails;
  return node;
}

/*
   Allocate an index tree node from pool p and store the given children,
   subtrie, character key and balance factor in it. Returns the new node.
*/
static VocIndexTree new_index_tree (Pool p, VocIndexTree left, VocIndexTree right,
				    VocTrie sub_trie, unsigned char key, signed char balfac)
{
  VocIndexTree node;

  node = (VocIndexTree) abs_pool_malloc(p, sizeof(struct voc_index_tree_rec), "new_index_tree");
  node->key = key;
  node->balfac = balfac;
  node->sub_trie = sub_trie;
  node->left = left;
  node->right = right;
  return node;
}

/*
   Freeing
*/
static void free_voc_index_tree (VocIndexTree tree);
static void free_voc_trie (VocTrie trie);

/*
   Release an index tree node by node: first both child trees, then the
   subtrie hanging off this node, finally the node itself. A NULL tree is
   ignored.

   NOTE(review): this uses abs_free, while the nodes built in this file
   come from abs_pool_malloc and lxcn_free_vocabulary releases the whole
   pool instead. Confirm the two are compatible before calling this.
*/
static void free_voc_index_tree (VocIndexTree tree)
{
  if (tree == NULL)
    return;

  free_voc_index_tree(tree->left);
  free_voc_index_tree(tree->right);
  free_voc_trie(tree->sub_trie);
  abs_free(tree, "free_voc_index_tree: tree");
}

/*
   Release a trie node: its search key, the index tree of its tails and
   then the node itself. A NULL trie is ignored.

   NOTE(review): this uses abs_free, while keys and nodes built in this
   file come from the abs_pool_* allocators. Confirm the two are
   compatible before calling this.
*/
static void free_voc_trie (VocTrie trie)
{
  if (trie == NULL)
    return;

  abs_free(trie->search_key, "free_voc_trie: search_key");
  free_voc_index_tree(trie->tails);
  abs_free(trie, "free_voc_trie: trie");
}

/*
   Leaf construction
*/
/*
   Build a trie leaf for key. The leaf gets its own string obtained from
   abs_pool_new_string, no tails, the given offset as rem_offset and
   info 0. *ret_info is set to the address of the leaf's info field.
*/
static VocTrie make_trie_leaf (Pool p, char *key, int offset, int **ret_info)
{
  char *own_key = abs_pool_new_string(p, key, "make_trie_leaf");
  VocTrie leaf = new_trie(p, voc_index_tree_nil, own_key, offset, 0);

  *ret_info = &(leaf->info);
  return leaf;
}

/*
   Push the entry held by trie node current one level down.
   A new trie leaf takes over current's search key pointer and info, with
   rem_offset advanced by one. It is wrapped in a single, balanced index
   tree node keyed on the character at current's rem_offset.
   current itself is not modified here.
*/
static VocIndexTree make_subtrie_leaf (Pool p, VocTrie current)
{
  VocTrie lowered =
	(VocTrie) abs_pool_malloc(p, sizeof(struct voc_trie_rec), "make_subtrie_leaf");
  int split_at = current->rem_offset;
  unsigned char branch_char = (unsigned char) current->search_key[split_at];

  lowered->search_key = current->search_key;
  lowered->rem_offset = split_at + 1;
  lowered->info = current->info;
  lowered->tails = voc_index_tree_nil;

  return new_index_tree(p, voc_index_tree_nil, voc_index_tree_nil, lowered, branch_char, 0);
}

/*
   Build a single balanced index tree node keyed on search_key[offset].
   Its subtrie is a fresh trie leaf for search_key whose remainder starts
   at offset + 1. *ret_info is set to the address of that leaf's info.
*/
static VocIndexTree make_index_tree_leaf (Pool p, char *search_key, int offset, int **ret_info)
{
  VocTrie leaf = make_trie_leaf(p, search_key, offset + 1, ret_info);
  unsigned char branch_char = (unsigned char) search_key[offset];
  VocIndexTree node;

  node = new_index_tree(p, voc_index_tree_nil, voc_index_tree_nil, leaf, branch_char, 0);
  return node;
}

/*
   Trie insertion
*/
/* Forward declaration: the two insertion routines are mutually recursive */
static void enter_into_trie (Pool p, VocTrie *root, char *search_key, int offset, int **ret_info);

/*
   Enter search_key into the index tree *root, which is a binary search
   tree ordered on the single character search_key[offset].

   - If a node for that character exists, insertion continues in its
     subtrie at offset + 1.
   - Otherwise a new leaf is added and the tree is rebalanced with a
     single or double rotation (AVL-style insertion). Balance factors are
     negative when the left side is deeper, positive when the right side
     is deeper.

   p          pool from which new nodes and keys are allocated
   root       address of the tree pointer; may be rewritten by a rotation
   search_key full key being entered
   offset     position in search_key of the character to branch on
   ret_info   receives the address of the info field of the entry
*/
static void enter_into_index_tree (Pool p, VocIndexTree *root, char *search_key, int offset, int **ret_info)
{ VocIndexTree *fixation;		/* link that points to last_unbal */
  VocIndexTree *insert_ptr;		/* link currently being followed */
  VocIndexTree last_unbal;		/* deepest node on the path with balfac != 0 */
  VocIndexTree below_unbal;		/* child of last_unbal on the insertion path */
  VocIndexTree twobelow_unbal;		/* grandchild used by the double rotation */
  VocIndexTree ptr, new_index_tree;	/* NB: the local new_index_tree hides the function of that name */
  unsigned char key = (unsigned char) search_key[offset];

  /*
     The following action can only be taken when our father node contains
     a valid prefix of our search key and our search key is the first key
     to be entered with this prefix.
  */
  if (*root == voc_index_tree_nil)
    { *root = make_index_tree_leaf (p, search_key, offset, ret_info);
      return;
    };

  /* search in tree */
  insert_ptr = root;
  fixation = root;
  last_unbal = *root;
  while ((*insert_ptr) != voc_index_tree_nil)
    { /* Remember the last node on the path that is already out of balance */
      if ((*insert_ptr) -> balfac)
	{ fixation = insert_ptr;
	  last_unbal = *insert_ptr;
	};
      if (key < (*insert_ptr) -> key)
        insert_ptr = &((*insert_ptr) -> left);
      else if (key > (*insert_ptr) -> key)
	insert_ptr = &((*insert_ptr) -> right);
      else
	{ /* We found a subtrie for this key, continue with next offset */
	  enter_into_trie (p, &((*insert_ptr) -> sub_trie), search_key, offset + 1, ret_info);
	  return;
        };
    };

  /*
     We found a location to insert the new leaf and we remember it: the
     pointer rotations for rebalancing may not leave *insert_ptr intact
  */
  new_index_tree = make_index_tree_leaf (p, search_key, offset, ret_info);
  *insert_ptr = new_index_tree;

  /* adjust balance factors from last_unbal to the inserted node */
  if (key < last_unbal -> key)
    { below_unbal = last_unbal -> left;
      last_unbal -> balfac--;
    }
  else
    { below_unbal = last_unbal -> right;
      last_unbal -> balfac++;
    };

  /* All nodes between last_unbal and the new leaf had balfac 0 until now */
  ptr = below_unbal;
  while (ptr != new_index_tree)
    if (key < ptr -> key)
      { ptr -> balfac--;
	ptr = ptr -> left;
      }
    else
      { ptr -> balfac++;
	ptr = ptr -> right;
      };

  /* if tree not too much out of balance, done */
  if ((-1 <= last_unbal -> balfac) && (last_unbal -> balfac <= 1))
    return;

  /* if last_unbal has the same sign as below_unbal, it is easy */
  if (((last_unbal -> balfac > 0) && (below_unbal -> balfac > 0)) ||
      ((last_unbal -> balfac < 0) && (below_unbal -> balfac < 0)))
    { /* Single rotation: below_unbal becomes the root of this subtree */
      if (last_unbal -> balfac > 0)
	{ last_unbal -> right = below_unbal -> left;
	  below_unbal -> left = last_unbal;
	}
      else
	{ last_unbal -> left = below_unbal -> right;
	  below_unbal -> right = last_unbal;
	}
      last_unbal -> balfac = 0;
      below_unbal -> balfac = 0;
      *fixation = below_unbal;
      return;
    };

  /* The difficult case */
  /* Double rotation: twobelow_unbal becomes the root of this subtree */
  if (last_unbal -> balfac > 0)
    { twobelow_unbal = below_unbal -> left;
      last_unbal -> right = twobelow_unbal -> left;
      below_unbal -> left = twobelow_unbal -> right;
      twobelow_unbal -> left = last_unbal;
      twobelow_unbal -> right = below_unbal;
      last_unbal -> balfac = (twobelow_unbal -> balfac == 1)?-1:0;
      below_unbal -> balfac = (twobelow_unbal -> balfac == -1)?1:0;
    }
  else
    { twobelow_unbal = below_unbal -> right;
      last_unbal -> left = twobelow_unbal -> right;
      below_unbal -> right = twobelow_unbal -> left;
      twobelow_unbal -> right = last_unbal;
      twobelow_unbal -> left = below_unbal;
      last_unbal -> balfac = (twobelow_unbal -> balfac == -1)?1:0;
      below_unbal -> balfac = (twobelow_unbal -> balfac == 1)?-1:0;
    };
  twobelow_unbal -> balfac = 0;
  *fixation = twobelow_unbal;
}

/*
   Enter search_key into the trie *root; the first offset characters of
   the key have already been consumed on the way down to this node.
   *ret_info receives the address of the info field belonging to the key.
   For a key that was entered before, this is the existing info field;
   for a new key the info field is initialised to 0.

   Note: when a leaf has to be lowered, its info value moves to a newly
   allocated node, so an info pointer handed out earlier for that leaf no
   longer refers to the entry.
*/
static void enter_into_trie (Pool p, VocTrie *root, char *search_key, int offset, int **ret_info)
{
  VocTrie node = *root;
  char *wanted = search_key + offset;

  /* Nothing here yet: a new leaf holds the whole remainder */
  if (node == voc_trie_nil) {
    *root = make_trie_leaf(p, search_key, offset, ret_info);
    return;
  }

  if (node->search_key != NULL) {
    char *stored = node->search_key + node->rem_offset;

    /* Same remainder: the key was entered before */
    if (strcmp(stored, wanted) == 0) {
      *ret_info = &(node->info);
      return;
    }

    /*
       The keys differ. A non-empty stored remainder means this node is
       a leaf: lower its entry into the tails and empty the node itself.
    */
    if (*stored != '\0') {
      node->tails = make_subtrie_leaf(p, node);
      node->info = 0;
      node->search_key = NULL;
      node->rem_offset = 0;
    }
  }

  /*
     The node now carries no key of its own. If our key ends exactly
     here (it is a prefix of keys entered earlier), it is stored in
     this node.
  */
  if (*wanted == '\0') {
    node->search_key = abs_pool_new_string(p, search_key, "enter_into_trie");
    node->rem_offset = offset;
    node->info = 0;
    *ret_info = &(node->info);
    return;
  }

  /* Characters remain: branch on the next one through the index tree */
  enter_into_index_tree(p, &(node->tails), search_key, offset, ret_info);
}

/*
   Lookup in vocabularies
   Note that this is the exact matching lookup.
*/
static int *lookup_in_trie (VocTrie voc, char *search_key, int offset);

/*
   Exact-match lookup, index tree part.
   Searches the binary tree vix for the character search_key[offset] and,
   when a node with that character exists, continues in its subtrie at
   offset + 1. Returns the address of the entry's info, or NULL when the
   key is not present.
*/
static int *lookup_in_index_tree (VocIndexTree vix, char *search_key, int offset)
{
  unsigned char wanted = (unsigned char) search_key[offset];
  VocIndexTree node = vix;

  while (node != voc_index_tree_nil) {
    if (wanted == node->key)
      return lookup_in_trie(node->sub_trie, search_key, offset + 1);
    node = (wanted < node->key) ? node->left : node->right;
  }

  return NULL;
}

/*
   Exact-match lookup, trie part.
   Compares the remainder of search_key (from offset on) with the
   remainder of the key stored in voc, if any, and otherwise descends
   into the tails. Returns the address of the entry's info, or NULL when
   the key is not present.
*/
static int *lookup_in_trie (VocTrie voc, char *search_key, int offset)
{
  char *wanted = search_key + offset;

  /* An empty vocabulary contains nothing */
  if (voc == voc_trie_nil)
    return NULL;

  if (voc->search_key != NULL) {
    char *stored = voc->search_key + voc->rem_offset;

    /* Remainders are equal: found */
    if (strcmp(stored, wanted) == 0)
      return &(voc->info);

    /* A different, non-empty stored remainder marks a leaf: not found */
    if (*stored != '\0')
      return NULL;
  }

  /* Nothing left to search with */
  if (*wanted == '\0')
    return NULL;

  /* Branch on the next character of the search key */
  return lookup_in_index_tree(voc->tails, search_key, offset);
}

/*
   Dumping of vocabularies
*/
/*
   Print the prefix of one dump line: ind1 '>' characters followed by
   ind1 + ind2 spaces.
*/
static void dump_indent (int ind1, int ind2)
{
  int remaining;

  for (remaining = ind1; remaining > 0; remaining--)
    abs_printf(">");
  for (remaining = ind1 + ind2; remaining > 0; remaining--)
    abs_printf(" ");
}

static void dump_trie (VocTrie l, int ind1, int ind2);

/*
   Dump an index tree in order: left subtree, the subtrie of this node,
   right subtree. Both subtrees are dumped with ind2 increased by one.
*/
static void dump_index_tree (VocIndexTree l, int ind1, int ind2)
{
  if (l != voc_index_tree_nil) {
    int child_ind2 = ind2 + 1;

    dump_index_tree(l->left, ind1, child_ind2);
    dump_trie(l->sub_trie, ind1, ind2);
    dump_index_tree(l->right, ind1, child_ind2);
  }
}

/*
   Dump one trie node and everything below it.
   A node that stores a key is shown as an indented line with the key,
   its rem_offset and its info. The tails are dumped with ind1 increased
   by one and ind2 reset to 0. A nil trie is reported through abs_abort.
*/
static void dump_trie (VocTrie l, int ind1, int ind2)
{
  if (l == voc_trie_nil)
    abs_abort("dump_trie", "nil trie");

  if (l->search_key != NULL) {
    dump_indent(ind1, ind2);
    abs_message("'%s' (rem %d): %d", l->search_key, l->rem_offset, l->info);
  }

  if (l->tails != voc_index_tree_nil)
    dump_index_tree(l->tails, ind1 + 1, 0);
}

/*
   Iterating over vocabularies
*/
static void iterate_over_trie (VocTrie l, VocIterProc vip, void *arg);

/*
   Visit an index tree in order: left subtree, the subtrie of this node,
   right subtree. vip and arg are passed through unchanged.
*/
static void iterate_over_index_tree (VocIndexTree l, VocIterProc vip, void *arg)
{
  if (l != voc_index_tree_nil) {
    iterate_over_index_tree(l->left, vip, arg);
    iterate_over_trie(l->sub_trie, vip, arg);
    iterate_over_index_tree(l->right, vip, arg);
  }
}

/*
   Visit one trie node and everything below it.
   For a node that stores a key, vip is called with arg, the stored key
   and the node's info value; after that the tails are visited.
   A nil trie is reported through abs_abort.
*/
static void iterate_over_trie (VocTrie l, VocIterProc vip, void *arg)
{
  if (l == voc_trie_nil)
    abs_abort("iterate_over_trie", "nil trie");

  if (l->search_key != NULL) {
    vip(arg, l->search_key, l->info);
  }

  if (l->tails != voc_index_tree_nil) {
    iterate_over_index_tree(l->tails, vip, arg);
  }
}

/*
   Saving of vocabularies
*/
static void bin_save_trie (BinFile bf, VocTrie l);

/*
   Write an index tree to bf.
   Per node: a tag byte (bit 4 marks an index tree node, bit 1 a left
   child, bit 2 a right child), the key character and the balance factor,
   followed by the left subtree (if any), the subtrie and the right
   subtree (if any). A nil tree or a nil subtrie is reported through
   abs_bug.
*/
static void bin_save_index_tree (BinFile bf, VocIndexTree l)
{
  char tag = 4;
  int has_left;
  int has_right;

  if (l == voc_index_tree_nil)
    abs_bug("bin_save_index_tree", "null index tree");
  if (l->sub_trie == voc_trie_nil)
    abs_bug("bin_save_index_tree", "null subtrie");

  has_left = (l->left != voc_index_tree_nil);
  has_right = (l->right != voc_index_tree_nil);
  if (has_left)
    tag |= 1;
  if (has_right)
    tag |= 2;

  abs_bin_save_char(bf, tag);
  abs_bin_save_char(bf, l->key);
  abs_bin_save_char(bf, l->balfac);

  if (has_left)
    bin_save_index_tree(bf, l->left);
  bin_save_trie(bf, l->sub_trie);
  if (has_right)
    bin_save_index_tree(bf, l->right);
}

/*
   Write a trie node to bf.
   First a tag byte: bit 1 is set when the node stores a key, bit 2 when
   it has tails. A nil trie is written as tag 0. When present, the key
   string, rem_offset and info follow, and then the index tree of the
   tails.
*/
static void bin_save_trie (BinFile bf, VocTrie l)
{
  int has_key = (l != voc_trie_nil) && (l->search_key != NULL);
  int has_tails = (l != voc_trie_nil) && (l->tails != voc_index_tree_nil);
  char tag = 0;

  if (has_key)
    tag |= 1;
  if (has_tails)
    tag |= 2;
  abs_bin_save_char(bf, tag);

  if (has_key) {
    abs_bin_save_string(bf, l->search_key);
    abs_bin_save_int(bf, l->rem_offset);
    abs_bin_save_int(bf, l->info);
  }

  if (has_tails)
    bin_save_index_tree(bf, l->tails);
}

/*
   Loading of vocabularies
*/
static void bin_load_trie (Pool p, BinFile bf, VocTrie *l);

/*
   Read an index tree from bf, in the layout written by
   bin_save_index_tree, and store the newly built node in *l.
   A tag with any of the bits 0xf8 set, or a balance factor outside
   -1..1, is reported through abs_abort.

   NOTE(review): the writer always sets marker bit 4 in the tag, but the
   reader does not require it. Confirm whether that is intentional.
*/
static void bin_load_index_tree (Pool p, BinFile bf, VocIndexTree *l)
{
  VocIndexTree left_child = voc_index_tree_nil;
  VocIndexTree right_child = voc_index_tree_nil;
  VocTrie middle;
  unsigned char node_key;
  signed char balance;
  char tag;

  abs_bin_load_char(bf, &tag);
  if ((tag & 0xf8) != 0)
    abs_abort("bin_load_index_tree", "read erroneous index tree tag");

  abs_bin_load_char(bf, (char *) &node_key);
  abs_bin_load_char(bf, (char *) &balance);
  if ((balance < -1) || (balance > 1))
    abs_abort("bin_load_index_tree", "read erroneous balance factor");

  /* Same order as on disk: left subtree, subtrie, right subtree */
  if (tag & 1)
    bin_load_index_tree(p, bf, &left_child);
  bin_load_trie(p, bf, &middle);
  if (tag & 2)
    bin_load_index_tree(p, bf, &right_child);

  *l = new_index_tree(p, left_child, right_child, middle, node_key, balance);
}

/*
   Read a trie node from bf, in the layout written by bin_save_trie, and
   store the newly built node in *l. A node is always created, also for
   tag 0; it then has no key and no tails. A tag with any of the bits
   0xfc set is reported through abs_abort.
*/
static void bin_load_trie (Pool p, BinFile bf, VocTrie *l)
{
  char tag;
  char *loaded_key = NULL;
  int loaded_offset = 0;
  int loaded_info = 0;
  VocIndexTree loaded_tails = voc_index_tree_nil;

  abs_bin_load_char(bf, &tag);
  if ((tag & 0xfc) != 0)
    abs_abort("bin_load_trie", "read erroneous trie tag");

  if (tag & 1) {
    /* The node stores a key: string, offset and info follow */
    abs_bin_load_pool_string(p, bf, &loaded_key);
    abs_bin_load_int(bf, &loaded_offset);
    abs_bin_load_int(bf, &loaded_info);
  }

  if (tag & 2)
    bin_load_index_tree(p, bf, &loaded_tails);

  *l = new_trie(p, loaded_tails, loaded_key, loaded_offset, loaded_info);
}

/*
   Create an empty vocabulary.
   p is the pool to allocate from. When p is NULL a new pool is created
   with abs_pool_init. The vocabulary record itself is allocated (zeroed)
   from that pool and remembers it in its pool field.
   Returns the new vocabulary.
*/
Vocabulary new_Vocabulary (Pool p)
{
  Vocabulary v;

  if (p == NULL)
    p = abs_pool_init (1 * 1024 * 1024, 8 * 1024 * 1024);

  /* Label the allocation with this function's own name, as elsewhere in this file */
  v = abs_pool_calloc (p, 1, sizeof (*v), "new_Vocabulary");
  v -> pool = p;
  return (v);
}

/*
   The actual vocabulary interface
*/
/*
   Enter search_key into the vocabulary *voc. When *voc is NULL a new
   vocabulary with its own pool is created first and stored in *voc.
   Returns the address of the info field of the entry: the existing one
   when the key was entered before, otherwise a new one holding 0.
*/
int *lxcn_enter_into_vocabulary (Vocabulary *voc, char *search_key)
{
  Vocabulary v = *voc;
  int *info_slot;

  if (v == NULL) {
    v = new_Vocabulary(NULL);
    *voc = v;
  }

  enter_into_trie(v->pool, &(v->trie), search_key, 0, &info_slot);
  return info_slot;
}

/*
   Exact-match lookup of search_key in voc.
   Returns the address of the info field of the entry, or NULL when the
   key is not present. A NULL vocabulary (nothing entered yet) is treated
   as empty, in line with the dump, iterate and save functions in this
   file, which also accept NULL.
*/
int *lxcn_lookup_in_vocabulary (Vocabulary voc, char *search_key)
{
  if (voc == NULL)
    return (NULL);
  return (lookup_in_trie (voc -> trie, search_key, 0));
}

/*
   Dump all entries of voc through dump_trie.
   A NULL vocabulary and a vocabulary whose trie is still nil (nothing
   entered or loaded yet) are both treated as empty: nothing is printed.
   Without the second check dump_trie would receive a nil trie and call
   abs_abort.
*/
void lxcn_dump_vocabulary (Vocabulary voc)
{
  if (voc == NULL)
    return;
  if (voc -> trie == voc_trie_nil)
    return;
  dump_trie (voc -> trie, 0, 0);
}

/*
   Call vip (arg, key, info) for every entry stored in voc.
   A NULL vocabulary and a vocabulary whose trie is still nil (nothing
   entered or loaded yet) are both treated as empty: vip is not called.
   Without the second check iterate_over_trie would receive a nil trie
   and call abs_abort.
*/
void lxcn_iterate_over_vocabulary (Vocabulary voc, VocIterProc vip, void *arg)
{
  if (voc == NULL)
    return;
  if (voc -> trie == voc_trie_nil)
    return;
  iterate_over_trie (voc -> trie, vip, arg);
}

/*
   Write the trie of voc to bf. A NULL vocabulary is written as a nil
   trie.
*/
void lxcn_bin_save_trie (BinFile bf, Vocabulary voc)
{
  VocTrie top = voc_trie_nil;

  if (voc)
    top = voc->trie;
  bin_save_trie(bf, top);
}

/*
 * Loads a trie from bf into *voc.
 * When *voc is NULL a new vocabulary is created on pool p (new_Vocabulary
 * creates a fresh pool when p is NULL). When *voc already exists, p is
 * ignored: the trie is loaded into the vocabulary's own pool and the
 * vocabulary's trie pointer is overwritten.
 */
void lxcn_bin_pool_load_trie (Pool p, BinFile bf, Vocabulary *voc)
{
  Vocabulary v = *voc;

  if (v == NULL) {
    v = new_Vocabulary(p);
    *voc = v;
  }
  bin_load_trie(v->pool, bf, &(v->trie));
  //abs_pool_stats ((*voc)->pool, "lxcn_bin_pool_load_trie");
}

/*
 * Loads a trie from bf into *voc without supplying a pool.
 * Intended for *voc == NULL, in which case a new vocabulary with its own
 * pool is created. If *voc already exists, its trie pointer is
 * overwritten by the loaded trie.
 */
void lxcn_bin_load_trie (BinFile bf, Vocabulary *voc)
{
  Pool no_pool = NULL;

  lxcn_bin_pool_load_trie(no_pool, bf, voc);
}

/*
   Save voc as a vocabulary file at path: a version header built from
   vocabulary_version, the trie and an end-of-file marker.
   A NULL vocabulary is saved as a nil trie, as lxcn_bin_save_trie does,
   instead of being dereferenced.
   Returns 0 on success, -1 when the file could not be opened.
*/
int lxcn_bin_save_vocabulary (char *path, Vocabulary voc)
{
  BinFile bf = abs_bin_fopen (path, "w");

  if (bf == NULL)
    return -1;

  abs_bin_save_version_nr (bf, "vocabulary", vocabulary_version);
  bin_save_trie (bf, voc ? voc -> trie : voc_trie_nil);
  abs_bin_save_eof (bf);
  abs_bin_fclose (bf);
  return 0;
}

/*
   Load a vocabulary file from path into *voc.
   The version header is verified against vocabulary_version first; only
   when that succeeds is the trie loaded and the end-of-file marker
   verified.
   Returns -1 when the file could not be opened, otherwise the result of
   the last verification performed (0 when everything was in order).
*/
int lxcn_bin_load_vocabulary (char *path, Vocabulary *voc)
{
  int status;
  BinFile bf = abs_bin_fopen(path, "r");

  if (bf == NULL)
    return -1;

  status = abs_bin_verify_version_nr(bf, "vocabulary", 0, vocabulary_version);
  if (status == 0) {
    lxcn_bin_load_trie(bf, voc);
    status = abs_bin_verify_eof(bf);
  }

  abs_bin_fclose(bf);
  return status;
}

/*
   Release a vocabulary by freeing the pool it was allocated from.
   A NULL vocabulary is ignored.

   The vocabulary record itself is allocated from that pool (see
   new_Vocabulary), so voc must not be used after this call.
   NOTE(review): when the pool was supplied by the caller (new_Vocabulary
   or lxcn_bin_pool_load_trie with a non-NULL pool), this frees the
   caller's pool as well. Confirm that callers expect this.
*/
void lxcn_free_vocabulary (Vocabulary voc)
{
  if (voc == NULL)
    return;
  abs_pool_free (voc -> pool, "lxcn_free_vocabulary");
}

