/*
   File: lxcn_search.c
   Defines the routines to search in the lexicon or to match grammar terminals

   Copyright 2009 Radboud University of Nijmegen
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id$"
*/

/*
 * TODO: add soft-hyphen convention to lxcn_nfa part
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* System includes */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* includes from the abase lib */
#include <abase_memalloc.h>
#include <abase_error.h>

/* local includes */
#include "lxcn_vocabulary.h"
#include "lxcn_lexicon.h"
#include "lxcn_vocabulary_impl.h"
#include "lxcn_lexicon_impl.h"
#include "lxcn_search.h"
#include "lxcn_nfa.h"

/* Most of the functional modifications can be turned off */
#define RECYCLE 1             /* re-use pointers */
#define ENABLE_FUZZY 0        /* 0: never, 1: use as backup, 2: always use */

/*
   Impose a maximum stack/heap size; 1024 is probably not enough for
   doing serious best-first searches.

   The replacement list is fully parenthesized so that the macro behaves
   as a single value in any expression (e.g. x / DEFAULT_STACK_SIZE).
*/
#define DEFAULT_STACK_SIZE ((1024+1)*64)

/* DB_FUZZY(x) expands to x only when DEBUG_FUZZY is defined, otherwise to nothing */
#ifdef DEBUG_FUZZY
#define DB_FUZZY(x) x
#else
#define DB_FUZZY(x)
#endif /* DEBUG_FUZZY */

/* Temporary inclusion of max_edit_distance variable: should become hidden */
int max_edit_distance = 0;

/* Best_first: 0 - disabled, 1 - best-only, 2 - best-only (cautious) 3 - best-first  */
enum { DepthFirst, BestOnly, BestCautious, BestAll };
int best_first = BestCautious;

/*
   Decide whether a hit may suppress hits at higher distances during a
   best-first search. The decision looks only at the first character of
   the remaining input: white space, a hyphen or end of string count as
   a proper word boundary. The marker argument is currently not consulted.

   Right now it is only really suited for singletoken and multitoken tries.

   Returns 1 for a proper boundary, 0 otherwise.
*/

static int is_proper_best_hit (char *input, int curr_mark)
{ char next_ch = *input;

  if (lxcn_is_white_space (next_ch)) return 1;
  if (next_ch == '-') return 1;
  return (lxcn_is_eos (next_ch) ? 1 : 0);
}

/*
   Advance *input past a run of white space.
   Returns 1 if at least one white space character was skipped,
   0 (leaving *input untouched) if the input does not start with white space.
*/
static int skip_white_space (char **input)
{ char *cursor = *input;

  if (!lxcn_is_white_space (*cursor)) return 0;

  /* Consume the first white space character and all that follow it */
  for (cursor++; lxcn_is_white_space (*cursor); cursor++)
    ;

  *input = cursor;
  return 1;
}

/*
 * Comparison function for searching in sorted lists of lexicon words.
 *
 * The whole input must be consumed by the lexeme:
 *   - a space in the lexeme matches one or more white space characters;
 *   - a soft hyphen in the lexeme matches white space, a '-' or nothing;
 *   - any other lexeme character matches itself, or (at a cost of one)
 *     the translation of the input character.
 *
 * Returns INT_MAX if the input doesn't match the lexicon word (lexeme)
 *         0 if they are identical
 *         > 0 for how many translations were needed to get them to match.
 */
int lxcn_strmatch (char *input, char *lexeme)
{ int cost = 0;
  char lex_ch;

  while (!lxcn_is_eos (lex_ch = *lexeme++))
    { char inp_ch = *input;
      char translated;

      if (lex_ch == ' ')
        { /* Lexeme space: demand a non empty run of white space */
          if (!skip_white_space (&input)) return INT_MAX;
          continue;
        }

      if ((unsigned char) lex_ch == SoftHyphenChar)
        { /* Soft hyphen: eat white space, else a hyphen, else nothing */
          if (!skip_white_space (&input) && inp_ch == '-') input++;
          continue;
        }

      if (lex_ch == inp_ch)
        { /* Literal match */
          input++;
          continue;
        }

      /* Last resort: the translated input character, which is penalized */
      translated = lxcn_translate (inp_ch);
      if (lex_ch != translated) return INT_MAX;
      input++;
      cost++;
    }

  /* The lexeme is exhausted; succeed only if the input is exhausted too */
  return (lxcn_is_eos (*input) ? cost : INT_MAX);
}

/*
   Match the remainder of a lexeme against the start of the input.

   Matching rules are those of lxcn_strmatch, except that trailing input
   is allowed and that a marker carrying LexemeLiteralBit forbids the use
   of translated characters. When a translated character is accepted and
   penalty is not NULL, lxcn_translate_penalty of the input character is
   added to *penalty. (Case-insensitive matching is not enabled here.)

   Returns a pointer just past the matched input, or NULL on mismatch.
*/
static char *match_lexeme_tail (char *input, char *lexeme, int curr_mark, int *penalty)
{ char lex_ch;

  while (!lxcn_is_eos (lex_ch = *lexeme++))
    { char inp_ch = *input;

      if (lex_ch == ' ')
        { /* Lexeme space: demand a non empty run of white space */
          if (!skip_white_space (&input)) return (NULL);
          continue;
        }

      if ((unsigned char) lex_ch == SoftHyphenChar)
        { /* Soft hyphen: eat white space, else a hyphen, else nothing */
          if (!skip_white_space (&input) && inp_ch == '-') input++;
          continue;
        }

      if (lex_ch == inp_ch)
        { /* Literal match */
          input++;
          continue;
        }

      /* Literal lexemes may not be reached through translation */
      if (curr_mark & LexemeLiteralBit) return (NULL);
      if (lex_ch != lxcn_translate (inp_ch)) return (NULL);

      /* Accepted through translation: account for its penalty */
      input++;
      if (penalty != NULL) *penalty += lxcn_translate_penalty (inp_ch);
    }

  /* Whole lexeme tail matched */
  return (input);
}

/*
   Feed the remaining characters of a lexeme to the automaton, one by one.
   Feeding stops early as soon as lxcn_nfa_feed_char reports a value
   above the cutoff; the automaton state is updated in place.
*/
static void match_fuzzy_lexeme_tail (AutomatonState *nfa, char *lexeme, int cutoff)
{ int lex_ch;

  while (!lxcn_is_eos (lex_ch = *lexeme++))
    { int dist_now = lxcn_nfa_feed_char (nfa, lex_ch);
      if (dist_now > cutoff) break;
    }
}

/*
   Matching of input against the lexicon:
   Mark that I unify some code with the matching of lexeme tails
*/

/*
   Define the structure of the lexicon iterator.

   iter_state names the states of the state machine implemented by
   lxcn_next_lexicon_match; every stack frame carries one of them.
*/
typedef enum
{ s_marker,			/* advance to the next vocabulary and its marker */
  s_entry,			/* match the rest of the trie node's search key */
  s_match_key,			/* descend on the current input character */
  s_match_trans,		/* descend on the translated input character */
  s_match_soft_hyphen,		/* descend on a soft hyphen edge, if present */
  s_pop_frame,			/* drop the top frame */
  s_match_key_insensitive,	/* push every sub trie whose key translates alike;
				   no visible code currently enters this state */

  /* Sensitive to order! cmp_stack aborts on any state <= s_fuzzy_match and
     uses the numeric difference of the remaining states as a priority,
     so s_fuzzy_pop_frame must stay the largest value. */
  s_fuzzy_marker,		/* build the automaton and seed the fuzzy search */
  s_fuzzy_match,		/* expand the sub tries of a visited fuzzy frame */
  s_fuzzy_entry,		/* feed the search key to the automaton, test for a hit */
  s_fuzzy_result,		/* a held hit waiting to be released */
  s_fuzzy_pop_frame,		/* drop the top frame of the fuzzy search */

} iter_state;

/*
   One frame of the iterator's work stack. During a best-first search
   the same array is treated as a heap ordered by cmp_stack.
*/
typedef struct stack_item_rec *match_stack;
struct stack_item_rec
{ AutomatonState nfa;           /* Dynamic state of the automaton */
  iter_state curr_state;	/* Iterator State */
  VocTrie curr_trie;		/* Pointer into trie */
  char *input;			/* Pointer to input to be matched */
  int penalty;			/* Penalty accrued due to char translation;
				   fuzzy frames store their distance here */
  int curr_mark;		/* Current marker tried */
};

/*
   Complete state of one lexicon search, created by lxcn_init_lexicon_match
   and advanced by lxcn_next_lexicon_match.
*/
struct lexicon_iterator_rec
{ AutomatonData nfa_table;      /* Static state of the automaton */
  char *xlat_pos[64];		/* Translate nfa pos back to input */
  Lexicon lexicon;		/* Pointer to lexicon */
  int curr_voc;                 /* Number of current vocabulary */
  int cutoff;                   /* Penalty of best result found so far */
  int ambiguity;		/* Amount of hits at cutoff distance */
  char *start_input;		/* Pointer to start of input to be matched */
  match_stack sp;		/* Always pointing to the last used */
  match_stack stack;		/* Stack of entries to iterate */
  size_t stack_size;		/* Number of entries allocated */
  /*int match_insensitive;*/	/* whether case-insensitive matching */
};

/*
   Push a new frame on the iterator stack and fill in its fields
   (the nfa field of the new frame is left untouched).

   iter -> sp points to the last used frame, so the new frame lands at
   index (sp - stack) + 1. That index must stay below stack_size; if it
   would not, the search is aborted instead of writing past the end of
   the allocated array.

   Be careful with iter -> sp after calling push: it has moved.
*/
static void push (LexiconIterator iter, iter_state curr_state, VocTrie curr_trie,
		  int curr_mark, char *input, int penalty)
{ size_t size = (iter -> sp) - (iter -> stack);

  /* Valid indices are 0 .. stack_size - 1 and the new frame goes to size + 1 */
  if (size + 1 >= iter -> stack_size)
    abs_abort("iter stack overflow","lxcn_next_lexicon_match");  /* TODO: dynamic resizing */

  iter -> sp++;
  iter -> sp -> curr_state = curr_state;
  iter -> sp -> curr_trie = curr_trie;
  iter -> sp -> curr_mark = curr_mark;
  iter -> sp -> input = input;
  iter -> sp -> penalty = penalty;
}

static void heap_push (LexiconIterator iter); /* see below */

/*
   Push a fuzzy frame. The automaton state stored in the new frame is a
   plain copy of *nfa when c is 0, otherwise the result of feeding c to
   *nfa. The value reported by the automaton becomes the frame's penalty.

   A frame whose penalty exceeds the current cutoff is dropped again at
   once; otherwise, when searching best first, it is sifted into the heap.
*/
static void push_nfa (LexiconIterator iter, iter_state curr_state, VocTrie curr_trie,
		      int curr_mark, AutomatonState *nfa, char c)
{ int mindist;

  push (iter, curr_state, curr_trie, curr_mark, NULL, 0);

  if (c != 0)
    mindist = lxcn_nfa_feed_char_copy (nfa, c, &iter -> sp -> nfa);
  else
    { iter -> sp -> nfa = *nfa;
      mindist = 0;
    }
  iter -> sp -> penalty = mindist;

  if (mindist > iter -> cutoff)
    { /* The automaton rejected this character: take the frame back */
      iter -> sp--;
      return;
    }

  if (best_first) heap_push (iter);
}

/* 
   Sorting functions

   This part can be improved in two ways:
   - sorting a list of pointers instead of the actual
     data will probably help

   - using buckets for each distance avoids the need
     for a heap altogether
*/
/* Swap the contents of two stack frames and count the swap in lxcn_swaps */
static void exchange (match_stack a, match_stack b)
{ struct stack_item_rec saved_b = *b;

  *b = *a;
  *a = saved_b;
  lxcn_swaps += 1;
}

/*
   Compare two stack frames to determine which one is more promising.

   Only frames in the states s_fuzzy_entry, s_fuzzy_result and
   s_fuzzy_pop_frame may be compared; any state up to and including
   s_fuzzy_match aborts the program.

   Returns: 0  if frames have equal priority
            >0 if a has higher priority than b
            <0 if a has lower priority
*/
static int cmp_stack (match_stack a, match_stack b)
{ iter_state state_a = a -> curr_state;
  iter_state state_b = b -> curr_state;
  int state_order;

  /* Frames of the exact matcher (or unexpanded fuzzy frames) do not belong here */
  if (state_a <= s_fuzzy_match || state_b <= s_fuzzy_match)
    abs_abort ("depth-first records encountered during best-first search", "cmp_stack");

  /* The enum order doubles as a priority: pop frame > result > entry */
  state_order = state_a - state_b;

  /* A pop frame outranks everything, whatever the penalties are */
  if (state_a == s_fuzzy_pop_frame || state_b == s_fuzzy_pop_frame)
    return (state_order);

  /* Otherwise the smaller penalty wins ... */
  if (a -> penalty != b -> penalty)
    return (b -> penalty - a -> penalty);

  /* ... and at equal penalty a result goes before an entry */
  return (state_order);
}

/*
   Sift the frame at iter -> sp up into its place in the heap that
   occupies iter -> stack[0 .. sp].
*/
static void heap_push (LexiconIterator iter)
{ match_stack base = iter -> stack;
  size_t pos = (iter -> sp) - base;

  while (pos > 0)
    { size_t up = (pos - 1) / 2;

      /* Stop once the parent is at least as promising */
      if (cmp_stack (base + pos, base + up) <= 0) return;
      exchange (base + pos, base + up);
      pos = up;
    }
}

/*
   Move the most promising frame to iter -> sp: the heap root is swapped
   with the frame at sp, after which the heap property is restored for
   the frames below sp (the frame at sp itself is no longer part of it).
*/
static void heap_pop (LexiconIterator iter)
{ match_stack base = iter -> stack;
  size_t count, pos = 0;

  exchange (iter -> sp, base);
  count = (iter -> sp) - base;

  for (;;)
    { size_t best = 2 * pos + 1;
      if (best >= count) break;

      /* Prefer the right child unless the left one is strictly better */
      if (best + 1 < count && cmp_stack (base + best, base + best + 1) <= 0)
        best++;

      /* Done when no child is strictly better than the current frame */
      if (cmp_stack (base + pos, base + best) >= 0) break;

      exchange (base + pos, base + best);
      pos = best;
    }
}

/*
   Recycle used memory allocations; this gains us speed.
   At most one finished iterator is kept here for reuse by the next
   lxcn_init_lexicon_match call (see lxcn_finish_lexicon_match).
*/
static LexiconIterator cached_iter = NULL;

/* Search statistics; lxcn_init_lexicon_match resets them to zero */
unsigned long lxcn_nodes_visited, lxcn_swaps, lxcn_entries_tried;

/*
   Create (or recycle) an iterator that searches the given lexicon for
   lexemes matching the start of input.

   The iterator starts with a single bottom frame in state s_marker
   (or s_fuzzy_marker when ENABLE_FUZZY > 1), its cutoff set to
   max_edit_distance and the global statistics counters reset.
   The input string is referenced, not copied.

   Hand the iterator back with lxcn_finish_lexicon_match when done.
*/
LexiconIterator lxcn_init_lexicon_match (char *input, Lexicon lexicon)
#if RECYCLE
{ LexiconIterator iter = cached_iter;

  /* Claim the cached pointer. Most definitely not thread-safe at present. */
  cached_iter = NULL;

  /* Nothing cached: allocate a fresh iterator and stack (contents uninitialized) */
  if (!iter)
    { iter = abs_malloc (sizeof (struct lexicon_iterator_rec), "lxcn_init_lexicon_match");
      iter -> stack = abs_malloc (DEFAULT_STACK_SIZE * sizeof (struct stack_item_rec),
				  "lxcn_init_lexicon_match");
      iter -> stack_size  = DEFAULT_STACK_SIZE;
    };
#else
{ LexiconIterator iter = abs_malloc (sizeof (struct lexicon_iterator_rec),
				     "lxcn_init_lexicon_match");
  iter -> stack = abs_calloc (DEFAULT_STACK_SIZE, sizeof (struct stack_item_rec),
				  "lxcn_init_lexicon_match");
  iter -> stack_size  = DEFAULT_STACK_SIZE;
#endif
  DB_FUZZY (abs_message ("[Searching '%s']\n", input));
  iter -> lexicon = lexicon;
  iter -> cutoff = max_edit_distance;
  iter -> ambiguity = 0;
  iter -> start_input = input;
  iter -> curr_voc = -1;	/* s_marker increments this before use */
  /*iter -> match_insensitive = 0;*/
  iter -> sp = iter -> stack;

  /* Use "s_fuzzy_marker" to skip exact matching */
#if ENABLE_FUZZY > 1
  iter -> sp -> curr_state = s_fuzzy_marker;
#else
  iter -> sp -> curr_state = s_marker;
#endif
  /* Remaining fields of the bottom frame; its nfa field stays unset for now */
  iter -> sp -> curr_trie = voc_trie_nil;
  iter -> sp -> curr_mark = -1;
  iter -> sp -> input = input;
  iter -> sp -> penalty = 0;
  lxcn_swaps = lxcn_nodes_visited = 0;
  lxcn_entries_tried = 0;
  return (iter);
}

/*
   Request case-insensitive matching for this iterator.
   Currently a no-op: the flag it used to set is disabled.
*/
void lxcn_make_iter_insensitive (LexiconIterator iter)
{ /* Disabled: iter -> match_insensitive = 1; */
  (void) iter;
}

/*
   Look up the sub trie reached from trie through the character key.
   The index tree under trie -> tails is a binary search tree that is
   compared against the key taken as an unsigned char.
   Returns voc_trie_nil when no such edge exists.
*/
static VocTrie search_subtrie (VocTrie trie, char key)
{ unsigned char wanted = (unsigned char) key;
  VocIndexTree node = trie -> tails;

  while (node != voc_index_tree_nil)
    { if (wanted < node -> key) node = node -> left;
      else if (wanted > node -> key) node = node -> right;
      else break;
    }

  if (node == voc_index_tree_nil) return (voc_trie_nil);
  return (node -> sub_trie);
}

/*
   Argument bundle for search_subtrie_insensitive_lv: the key to look for
   plus everything needed to call push for each index tree node that matches.
*/
struct key_push_args {
    char key;			/* character to compare (after translation) */

    LexiconIterator iter;	/* iterator to push frames on */
    iter_state state;		/* state given to every pushed frame */
    int curr_mark;		/* marker given to every pushed frame */
    char *input;		/* input position given to every pushed frame */
    int penalty;		/* penalty given to every pushed frame */
};

/*
   Walk a whole index tree (left subtree, right subtree, then the node
   itself) and push a frame for every node whose key has the same
   translation as the key we are looking for.
*/
static void search_subtrie_insensitive_lv (VocIndexTree lv, struct key_push_args *pa)
{ if (lv -> left)
    search_subtrie_insensitive_lv (lv -> left, pa);
  if (lv -> right)
    search_subtrie_insensitive_lv (lv -> right, pa);

  /* Nodes whose translated key differs are not continuations */
  if (lxcn_translate (lv -> key) != lxcn_translate (pa -> key)) return;

  push (pa -> iter, pa -> state, lv -> sub_trie, pa -> curr_mark, pa -> input, pa -> penalty);
}

/*
   Push a frame for every sub trie of trie whose edge character equals
   key after translation. The remaining arguments are handed to push
   unchanged for each frame.
*/
static void search_subtrie_insensitive (VocTrie trie, char key,
	/* arguments for push */ LexiconIterator iter, iter_state state, int curr_mark, char *input, int penalty)
{
    struct key_push_args bundle;
    VocIndexTree root = trie -> tails;

    /* A trie node without continuations has nothing to push */
    if (root == voc_index_tree_nil)
	return;

    /*
     * Bundle the fixed arguments once, so the recursion only has
     * to carry a single pointer.
     */
    bundle.iter = iter;
    bundle.state = state;
    bundle.curr_mark = curr_mark;
    bundle.input = input;
    bundle.penalty = penalty;
    bundle.key = key;

    search_subtrie_insensitive_lv (root, &bundle);
}

/*
   Recursively generate all continuations at once: for each node of the
   index tree (node first, then left and right subtree) a fuzzy entry
   frame is pushed with the node's key fed to the automaton.
   The walk does not descend below a node without a sub trie.
*/
static void expand_all_subtries (LexiconIterator iter, VocIndexTree tree, int mark,
				 AutomatonState *nfa)
{ if (tree == voc_index_tree_nil) return;
  if (tree -> sub_trie == voc_trie_nil) return;

  push_nfa (iter, s_fuzzy_entry, tree -> sub_trie, mark, nfa, tree -> key);
  expand_all_subtries (iter, tree -> left , mark, nfa);
  expand_all_subtries (iter, tree -> right, mark, nfa);
}

/*
   Push the fuzzy continuations of a trie node.

   Optimization: search deterministically when possible.
   When the automaton reports a deterministic position, only the edge for
   the input character at that position (white space counting as ' ')
   and the edge for its translation are tried. Otherwise every edge is
   expanded. This is a noticeable improvement for small edit distances.
 */
static void search_all_subtries (LexiconIterator iter, VocTrie trie, int mark, AutomatonState *nfa)
{ int inp_ch, lex_ch, trans_ch;
  VocTrie next;
  int det_pos = lxcn_nfa_deterministic_pos (nfa, iter -> cutoff);

  if (det_pos < 0)
    { /* No single expected character: try all edges */
      expand_all_subtries (iter, trie -> tails, mark, nfa);
      return;
    }

  inp_ch = *iter -> xlat_pos[det_pos];
  lex_ch = lxcn_is_white_space (inp_ch)? ' ' : inp_ch;
  trans_ch = lxcn_translate (lex_ch);
  next = search_subtrie (trie, lex_ch);

  DB_FUZZY(abs_message ("Determinism (%c) =>%s\n", lex_ch, iter -> xlat_pos[det_pos]+1));
  if (next != voc_trie_nil)
    push_nfa (iter, s_fuzzy_entry, next, mark, nfa, lex_ch);

  /* Also push the alternative, if any */
  if (lex_ch != trans_ch && lex_ch != ' ')
    { next = search_subtrie (trie, trans_ch);
      /* NOTE(review): the automaton is fed lex_ch, not trans_ch, for the
         translated edge - presumably so that translation costs no edit
         distance; confirm this is intended. */
      if (next != voc_trie_nil)
        push_nfa (iter, s_fuzzy_entry, next, mark, nfa, lex_ch);
    }
}

/*
   Main routine: find next match in the search space

   This is a state machine driven by the state of the top stack frame.
   It starts in state s_marker to select the next vocabulary and its
   lexicon marker (pre/postfix bits etc), then goes to s_entry to match
   the word proper. This state checks if the end is reached.
   Otherwise, each letter of the input word is matched in state s_match_key,
   and if it makes a difference, the translated letter is matched in
   s_match_trans.
   Then state s_match_soft_hyphen checks if a soft hyphen is present in the
   lexicon and if so, matches the rest.
   Otherwise, the current frame is popped (because the match fails).
   Then a frame that was pushed earlier gets back to work.

   The s_fuzzy_* states implement the approximate search with the automaton;
   with ENABLE_FUZZY set to 0 the state machine never switches to them.

   Returns a pointer to the input just past the matched lexeme, or NULL when
   there are no (more) matches. After a non-NULL result the details of the
   hit can be read with lxcn_get_lexicon_match_info; calling this routine
   again continues the search.
*/
char *lxcn_next_lexicon_match (LexiconIterator iter)
{ while (1)
    { /* Take a snapshot of the top frame; the cases below update the frame itself */
      iter_state state = iter -> sp -> curr_state;
      VocTrie curr_trie = iter -> sp -> curr_trie;
      int curr_mark = iter -> sp -> curr_mark;
      char *input = iter -> sp -> input;
      int penalty = iter -> sp -> penalty;

      DB_FUZZY(abs_message ("Size[%d] state[%d] cutoff[%d]\n", iter -> sp - iter -> stack,
	       state, iter -> cutoff));

      /* Statistics: count every (exact or fuzzy) entry frame that gets visited */
      lxcn_nodes_visited += (state == s_entry) || (state == s_fuzzy_entry);

      /* Dispatch on the state of the top frame */
      switch (state)
	{ case s_marker:
	    { /* In this state we advance to the next vocabulary and marker */
	      iter -> curr_voc++;
	      if (iter -> curr_voc >= iter -> lexicon -> nr_vocabularies)
#if ENABLE_FUZZY
	        { /* All vocabularies tried exactly; go fuzzy unless the cutoff forbids it */
		  if (iter -> cutoff == 0) return (NULL);	/* Done matching */
		  iter -> sp -> curr_state = s_fuzzy_marker;
		  break;
		};
#else
	        return (NULL);	/* Done matching */
#endif

	      /* New vocabulary to try */
	      curr_mark = iter -> lexicon -> all_lexeme_markers[iter -> curr_voc];
	      push (iter, s_entry, iter -> lexicon -> all_vocabularies[iter -> curr_voc] -> trie,
		    curr_mark, input, 0);
	    }; break;

	  case s_entry:
	    { /* Remember to continue with the next character as search key */
	      /*if (iter -> match_insensitive)
	        iter -> sp -> curr_state = s_match_key_insensitive;
	      else*/
	        iter -> sp -> curr_state = s_match_key;
	      /* A trie node without a search key holds no lexeme of its own */
	      if (curr_trie -> search_key == NULL) break;
              lxcn_entries_tried++;
	      /*if (iter -> match_insensitive) curr_mark |= LexemeInsensitiveBitBit;*/
	      /* Translation penalties are added straight into the frame's penalty */
	      input = match_lexeme_tail (input, curr_trie -> search_key + curr_trie -> rem_offset,
					 curr_mark, &iter -> sp -> penalty);
	      /* Return a result if some non-empty string was matched */
	      if ((input != NULL) && (input > iter -> start_input))
	        { /* A cutoff of 0 makes s_marker stop before the fuzzy phase */
		  if ((best_first == DepthFirst) ||
		      ((best_first < BestAll) && is_proper_best_hit (input, curr_mark)))
		    iter -> cutoff = 0;
		  DB_FUZZY(abs_message ("Exact match '%s' => '%s'\n",
					curr_trie -> search_key, input));
	          /* add this: iter -> sp -> curr_state = s_pop_frame; ? */
		  return (input);
		};
	    }; break;

	  case s_fuzzy_marker:
	    { /* Initialize the NFA (which was garbage until now) */
	      AutomatonState nfa;
	      char pattern[64];
	      int i;
	      DB_FUZZY(abs_message ("Becoming fuzzy...\n"));

	      /*
		 Create a "clean" version of the search string: leading white space
		 is dropped, every run of white space becomes a single ' ' and at
		 most sizeof(pattern)-1 characters are kept. xlat_pos[i] remembers
		 the input position that pattern[i] was taken from.
	      */
	      skip_white_space (&input);
	      for (i = 0; i < sizeof(pattern)-1 && *input; i++) 
	        { iter -> xlat_pos[i] = input;
		  pattern[i] = skip_white_space (&input)? ' ' : *input++;
		};
	      iter -> xlat_pos[i] = input;
	      pattern[i] = '\0';
	      if (*input)
		DB_FUZZY (abs_message ("Fuzzy matching ignoring part of input\n"));
	      DB_FUZZY(abs_message ("Pattern [%s]\n", pattern));

	      /* Generate NFA & multiple entries */
	      lxcn_nfa_create (pattern, max_edit_distance, &iter -> nfa_table, &nfa);

	      /* This frame becomes a pop frame; one entry per vocabulary goes on top,
		 pushed in reverse vocabulary order */
	      iter -> sp -> curr_state = s_fuzzy_pop_frame;
	      for (i = iter -> lexicon -> nr_vocabularies; i--; )
		{ VocTrie trie = iter -> lexicon -> all_vocabularies[i] -> trie;
		  curr_mark = iter -> lexicon -> all_lexeme_markers[i];
	          push_nfa (iter, s_fuzzy_entry, trie, curr_mark, &nfa, 0);
		};
	    }; break;

	  case s_fuzzy_entry:
            { AutomatonState nfa;
	      int offset, distance;
              char *wordpart;

	      /* Whatever happens below, the sub tries of this frame are expanded next */
              iter -> sp -> curr_state = s_fuzzy_match;
              if (curr_trie -> search_key == NULL) break;

	      lxcn_entries_tried++;
	      wordpart = (curr_trie -> search_key) + (curr_trie -> rem_offset);
	      DB_FUZZY(abs_message ("Trying to match '%.*s|%s'\n", curr_trie -> rem_offset,
		       curr_trie -> search_key, wordpart));

	      /* Work on a copy, so the frame keeps the state before the key tail */
	      nfa = iter -> sp -> nfa;
	      match_fuzzy_lexeme_tail (&nfa, wordpart, iter -> cutoff);

	      /*
		 We now may have a match; but where do we let it end? We could use
		 up all the available edit distance to delete input characters (get_max_pos),
		 but this is probably a bad idea as it may upset lexicalisation of further
		 elements in the input stream.
                 
                 So presently we start at the min_pos and try to find a word separator.
		 This also reduces the amount of partial hits found by the lexer, which
		 is good for our peace of mind.
	      */

	      /*
		 NOTE(review): offset is not checked against the size of xlat_pos.
		 When the pattern was truncated, xlat_pos[63] does not point at the
		 end of the input, so this loop could step to index 64 - confirm
		 whether that can happen in practice.
	      */
	      lxcn_nfa_get_min_pos (&nfa, &offset, &distance);
	      while (distance <= max_edit_distance &&
		     !is_proper_best_hit (iter -> xlat_pos[offset], curr_mark))
		  distance++, offset++;

#if ENABLE_FUZZY < 2
	      /* Don't return results already given by the exact match phase */
              if (distance == 0) break;
#endif

	      /*
		 We filter matches where the penalty is higher than the length of the
		 input matched to prevent spontaneous generation of lexical elements
	      */
	      if (offset <= distance || distance > iter -> cutoff) break;

	      input = iter -> xlat_pos[offset];
	      DB_FUZZY(abs_message ("Matched (%s) at distance (%d) =>%s\n",
		       best_first? "hold" : "return", distance, input));

	      /* Record the hit in the frame, where lxcn_get_lexicon_match_info reads it */
	      iter -> sp -> input = input;
	      iter -> sp -> penalty = distance;

	      /* If we are not searching best first, simply return */
	      if (!best_first)
		return (input);

	      /* If doing a best-ONLY, set the new cutoff value to the current hit */
	      if (best_first < BestAll && is_proper_best_hit (input, curr_mark))
		{ if (distance < iter -> cutoff) iter -> ambiguity = 0;
		  else if (best_first == BestCautious && iter -> ambiguity)
		    { /* We have found another hit at the same distance; lower the bar */
		      iter -> cutoff--;
		      iter -> ambiguity = 0;
#if ENABLE_FUZZY < 2
		      /*
			 Stop searching when the cutoff is 0,
			 since we already know there are none
		      */
		      if (iter -> cutoff == 0)
			{ iter -> sp = iter -> stack;
			  iter -> sp -> curr_state = s_pop_frame;
			  DB_FUZZY(abs_message ("Abandoned search\n"));
			  return (NULL);
			};
#endif
		      break;
		    };
		  iter -> cutoff = distance;
		  iter -> ambiguity++;
		};

	      /* Enqueue the result & continue search */
	      /* In cautious mode the held result is ranked one penalty point lower;
		 s_fuzzy_result undoes this increment before releasing it */
	      if (best_first == BestCautious) iter -> sp -> penalty++;
	      iter -> sp -> curr_state = s_fuzzy_result;
	      /* Copy the frame's automaton first: heap_push may move the frame away from sp */
	      nfa = iter -> sp -> nfa;
	      heap_push (iter);
	      search_all_subtries (iter, curr_trie, curr_mark, &nfa);
	      heap_pop (iter);
	    }; break; /* no fall through into s_fuzzy_match */

          /*
	     It is possible to determine whether a NFA has become deterministic, and then
             switch back to the faster routines. This would need to be done very carefully
             when mixed with best-first searching...
	  */
	  case s_fuzzy_match:
            { AutomatonState nfa;

	      /* Save the automaton, then retire this frame before pushing its continuations */
	      nfa = iter -> sp -> nfa;
              if (iter -> sp == iter -> stack)
	        iter -> sp -> curr_state = s_fuzzy_pop_frame;
	      else iter -> sp--;  

	      search_all_subtries (iter, curr_trie, curr_mark, &nfa);
	      /* Bring the most promising frame to the top */
	      if (best_first) heap_pop (iter);
            }; break;

	  case s_match_key_insensitive:
	    { char inp_ch = *input;
	      char lex_ch = lxcn_is_white_space (inp_ch)? ' ' : inp_ch;
	      iter_state next_state = s_entry;
	      char *next_input = input+1;

	      /* A lexeme space stands for the whole run of white space */
	      if (lex_ch == ' ') skip_white_space (&next_input);

	      /* Remember we have to check soft hyphens */
	      if ((curr_mark & LexemeLiteralBit) || (lex_ch == ' '))
		  next_state = s_match_soft_hyphen;

	      /* push all possible continuations */
	      iter -> sp -> curr_state = s_pop_frame;
	      search_subtrie_insensitive (curr_trie, lex_ch,   iter, next_state, curr_mark, next_input, penalty);

	    }; break;

	  case s_match_key:
	    { char inp_ch = *input;
	      char lex_ch = lxcn_is_white_space (inp_ch)? ' ' : inp_ch;
	      VocTrie sub_trie;
	      sub_trie = search_subtrie (curr_trie, lex_ch);

	      /* Remember we have to check the translated character and soft hyphens */
	      /* (literal lexemes and white space skip the translation step) */
	      if ((curr_mark & LexemeLiteralBit) || (lex_ch == ' '))
		iter -> sp -> curr_state = s_match_soft_hyphen;
	      else iter -> sp -> curr_state = s_match_trans;

	      if (sub_trie != voc_trie_nil)
		{ /* We have longer lexemes to match */
		  input++;
		  if (lex_ch == ' ') skip_white_space (&input);
		  push (iter, s_entry, sub_trie, curr_mark, input, penalty);
		  break;
		};
	    }; break; /* no fall through: the frame's new state is handled next round */

	  case s_match_trans:
	    { char inp_ch = *input;
	      char trans_ch = lxcn_translate (inp_ch);
	      char lex_ch = trans_ch;
	      VocTrie sub_trie;

	      /* Remember we have to check for soft hyphens */
	      iter -> sp -> curr_state = s_match_soft_hyphen;

	      /* If translation is identical, we already tried the character */
	      if (inp_ch == trans_ch) break;
	        
	      sub_trie = search_subtrie (curr_trie, lex_ch);
	      if (sub_trie != voc_trie_nil)
		{ /* We have longer lexemes to match */
	          input++;
		  /* The translated step costs the translation penalty of the input character */
		  push (iter, s_entry, sub_trie, curr_mark, input,
			penalty + lxcn_translate_penalty (inp_ch));
	          break;
	        };
	    }; break; /* no fall through: the frame's new state is handled next round */

	  case s_match_soft_hyphen:
            { /* Remember we have to pop the current frame after continuation */
	      VocTrie sub_trie = search_subtrie (curr_trie, SoftHyphenChar);
	      iter -> sp -> curr_state = s_pop_frame;
	      if (sub_trie != voc_trie_nil)
	        { /* Three possibilities: a hyphen, white space or nothing in the input */
		  char inp_ch = *input;
		  if (inp_ch == '-') input++;
		  else skip_white_space (&input);
		  /* else do not move the input pointer */

		  /* Match the current input pointer against the SoftHyphen subtrie */
		  push (iter, s_entry, sub_trie, curr_mark, input, penalty);
		  break;
		};
            }; break;

          case s_fuzzy_result:
	    { /* Undo the ranking increment applied when the result was enqueued */
	      if (best_first == BestCautious) iter -> sp -> penalty--;
	      /* The cutoff has dropped below this result: discard everything */
	      if (iter -> cutoff < iter -> sp -> penalty)
                { /* Unwind the stack */
		  DB_FUZZY(abs_message ("Unwinding...\n"));
		  iter -> sp = iter -> stack;
                  iter -> sp -> curr_state = s_fuzzy_pop_frame;
                  break;
                };
	      /* Release the held hit; the frame is popped on the next call */
              iter -> sp -> curr_state = s_fuzzy_pop_frame;
	      DB_FUZZY(abs_message ("Releasing held match '%s' =>%s\n",
				    curr_trie -> search_key, input));
              return (input);
            }; /* not reached: both paths above leave the case */

	  case s_fuzzy_pop_frame:
	  case s_pop_frame:
	    { /* If possible, pop the top frame */
	      if (iter -> sp == iter -> stack)
		return (NULL);			/* Done matching */
	      iter -> sp--;
	      /* In a best-first search the next frame to work on comes from the heap */
	      if (state == s_fuzzy_pop_frame && best_first) heap_pop (iter);
	    };
	    /* continues into default, which only breaks */
	  default: break;
	}; 
    };

  return (NULL);
}

/*
   Hand back an iterator after use (not threadsafe).
   The iterator is parked in the one-slot cache for reuse when that slot
   is empty; otherwise its stack and the iterator itself are freed.
*/
void lxcn_finish_lexicon_match (LexiconIterator iter)
{ if (cached_iter != NULL)
    { /* Cache slot is taken: really release the memory */
      abs_free ((void *) iter -> stack, "lxcn_finish_lexicon_match");
      abs_free ((void *) iter, "lxcn_finish_lexicon_match");
      return;
    }

  cached_iter = iter;
}

/*
   Release the cached iterator, if any (not threadsafe).
   Because the cache slot is still occupied during the call,
   lxcn_finish_lexicon_match takes its freeing branch.
*/
void lxcn_dispose_garbage (void)
{ if (cached_iter != NULL)
    { lxcn_finish_lexicon_match (cached_iter);
      cached_iter = NULL;
    }
}

/*
   Access routine for matches: report the details of the hit held in the
   top stack frame of the iterator.

   entry_nr, matched_lexeme and matched_marker are always written;
   penalty is optional and may be NULL.
*/
void lxcn_get_lexicon_match_info (LexiconIterator iter, int *entry_nr, char **matched_lexeme,
				  LexemeType *matched_marker, int *penalty)
{ match_stack top = iter -> sp;
  VocTrie hit = top -> curr_trie;

  *entry_nr = hit -> info;
  *matched_lexeme = hit -> search_key;
  *matched_marker = lxcn_lex_type_from_marker (top -> curr_mark);

  if (penalty == NULL) return;
  *penalty = top -> penalty;
}

/*
   Some simple statistics about the lexicon
*/
static void trie_size (VocTrie trie, int *nodes, int *entries, size_t *strsize);

/*
   Accumulate the statistics of every trie reachable through an index tree.
   A nil tree, or a node without a sub trie, contributes nothing and is
   not descended into.
*/
static void index_tree_size (VocIndexTree tree, int *nodes, int *entries, size_t *strsize)
{ if (tree == voc_index_tree_nil) return;
  if (tree -> sub_trie == voc_trie_nil) return;

  trie_size (tree -> sub_trie, nodes, entries, strsize);
  index_tree_size (tree -> left, nodes, entries, strsize);
  index_tree_size (tree -> right, nodes, entries, strsize);
}

/*
   Accumulate the statistics of a trie node and everything below it:
   every node counts as one node; a node with a search key also counts
   as one entry and adds the length of that key to *strsize.
*/
static void trie_size (VocTrie trie, int *nodes, int *entries, size_t *strsize)
{ if (trie -> search_key != NULL)
    { *strsize += strlen (trie -> search_key);
      *entries += 1;
    }
  *nodes += 1;

  index_tree_size (trie -> tails, nodes, entries, strsize);
}

/*
   Collect size statistics of a lexicon: the number of vocabularies and,
   summed over all of their tries, the number of trie nodes, the number
   of entries and the total length of the search keys.
*/
void lxcn_lexicon_stats (Lexicon lexicon, int *vocs, int *nodes, int *entries, size_t *strsize)
{ int remaining = lexicon -> nr_vocabularies;

  *vocs = remaining;
  *nodes = 0;
  *entries = 0;
  *strsize = 0;

  /* Visit the vocabularies from the last one down to the first */
  while (remaining--)
    trie_size (lexicon -> all_vocabularies[remaining] -> trie, nodes, entries, strsize);
}
