/*
   File: lxcn_search.c
   Defines the routines to search in the lexicon or to match grammar terminals

   Copyright 2007 Radboud University of Nijmegen

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of   
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
   CVS ID: "$Id: lxcn_search.c,v 1.2 2007/10/31 14:58:06 marcs Exp $"
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* System includes */
#include <stdio.h>
#include <stdlib.h>

/* includes from the abase lib */
#include <abase_memalloc.h>
#include <abase_error.h>

/* local includes */
#include "lxcn_vocabulary.h"
#include "lxcn_lexicon.h"
#include "lxcn_vocabulary_impl.h"
#include "lxcn_lexicon_impl.h"
#include "lxcn_search.h"

/*
   Until grammar terminals are passed to the lexicon generator
   and added to the lexicon, the runtime system should match
   grammar terminals, using the following code.

   Hint for andres:
   This one should also be extended for recognition with edit_distance
*/
/*
   Match the text of a lexeme against the start of the input.

   A space in the lexeme stands for one or more blank input characters
   (as judged by lxcn_is_blank); every other lexeme character must equal
   the input character itself or its translation (lxcn_translate).

   Returns the input position directly behind the matched text,
   or NULL when the lexeme does not match.
*/
static char *match_lexeme_tail (char *input, char *lexeme)
{ char want, got;

  while (!lxcn_is_eos (want = *lexeme++))
    { got = *input;

      if (want == ' ')
        { /* At least one blank is required, all following blanks are swallowed */
          if (!lxcn_is_blank (got)) return (NULL);
          do input++;
          while (lxcn_is_blank (*input));
          continue;
        }

      /* Ordinary character: accept it literally or through its translation */
      if ((want == got) || (want == lxcn_translate (got)))
        { input++;
          continue;
        }

      /* Mismatch */
      return (NULL);
    }

  /* The whole lexeme has been consumed */
  return (input);
}

/*
   Match a grammar terminal against the input.

   For grammar terminals, the lexeme marker may still be encoded as
   the first character of the lexeme. That character is offered to
   is_an_old_lex_marker together with lex_type (which presumably fills
   in *lex_type on success - confirm against its definition). When it is
   a marker, matching starts at the second character; otherwise *lex_type
   is set to SingleToken and the complete lexeme is matched.

   Returns the input position behind the match, or NULL on a mismatch.
*/
char *lxcn_match_lexeme (char *input, char *lexeme, LexemeType *lex_type)
{ char first = lexeme[0];

  if (!is_an_old_lex_marker (first, lex_type))
    { /* No marker present: the lexeme is plain text */
      *lex_type = SingleToken;
      return (match_lexeme_tail (input, lexeme));
    }

  /* Skip the marker character */
  return (match_lexeme_tail (input, lexeme + 1));
}

/*
  Matching of input against the lexicon:

  Note that part of this reuses the code for matching lexeme tails
*/

/*
   Impose a maximum match length.
   The frame stack of a lexicon iterator is allocated with
   MAX_MATCH_LEN + 1 entries (see lxcn_init_lexicon_match).
*/
#define MAX_MATCH_LEN 1024

/* Define the structure of the lexicon iterator */
/* The state of a stack frame tells lxcn_next_lexicon_match what to do with it next */
typedef enum
{ s_marker,	/* Advance to the next vocabulary and its lexeme marker */
  s_entry,	/* Try to match the remaining tail of the entry at this trie node */
  s_match_p,	/* Descend into the sub trie keyed by the input character itself */
  s_match_t,	/* Descend into the sub trie keyed by the translated input character */
  s_exit	/* Nothing left to try: pop this frame */
} iter_state;

/*
   One frame of the iterator stack: a position in a vocabulary trie
   paired with the input position still to be matched from there.
   Note that match_stack is a pointer to a frame.
*/
typedef struct stack_item_rec *match_stack;
struct stack_item_rec
{ iter_state curr_state;	/* Iterator State: what to try next for this frame */
  VocTrie curr_trie;		/* Pointer into trie */
  char *input;			/* Pointer to input to be matched */
};

/*
   The lexicon iterator: it walks all vocabularies of a lexicon in turn,
   keeping an explicit stack of trie positions so that matching can be
   suspended on every match and resumed by the next call.
*/
struct lexicon_iterator_rec
{ NewLexicon lexicon;		/* Pointer to lexicon */
  int curr_voc;			/* Number of current vocabulary (-1 before the first) */
  int curr_mark;		/* Current marker tried (lexeme marker of current vocabulary) */
  match_stack sp;		/* Always pointing to the last used frame (top of stack) */
  match_stack stack;		/* Stack of entries to iterate (bottom frame) */
};

/*
   Create an iterator that enumerates the entries of the lexicon
   matching the start of the input.

   The iterator owns a stack of MAX_MATCH_LEN + 1 frames whose bottom
   frame is in state s_marker, so that the first call of
   lxcn_next_lexicon_match starts with the first vocabulary.
   The iterator must be released with lxcn_finish_lexicon_match.

   To optimize we could save a freed iterator in a private variable.
*/
LexiconIterator lxcn_init_lexicon_match (char *input, NewLexicon lexicon)
{ match_stack frames = abs_calloc (MAX_MATCH_LEN + 1, sizeof (struct stack_item_rec),
                                   "lxcn_init_lexicon_match");
  LexiconIterator new_iter = abs_malloc (sizeof (struct lexicon_iterator_rec),
                                         "lxcn_init_lexicon_match");

  /* Prepare the bottom frame: no trie yet, the whole input still to match */
  frames -> curr_state = s_marker;
  frames -> curr_trie = voc_trie_nil;
  frames -> input = input;

  /* No vocabulary has been selected yet */
  new_iter -> lexicon = lexicon;
  new_iter -> curr_voc = -1;
  new_iter -> stack = frames;
  new_iter -> sp = frames;
  return (new_iter);
}

/*
   Look up the sub trie that is reached from the given trie node
   through the character key. The tails of a trie node form a binary
   search tree ordered on the (unsigned) character value.
   Returns voc_trie_nil if the node has no tail for this character.
*/
static VocTrie search_subtrie (VocTrie trie, char key)
{ unsigned char wanted = (unsigned char) key;
  VocIndexTree node;

  /* Walk down the index tree until the key is found or we fall off */
  for (node = trie -> tails; node != voc_index_tree_nil; )
    { if (wanted < node -> key) node = node -> left;
      else if (wanted > node -> key) node = node -> right;
      else return (node -> sub_trie);
    }

  /* No tail for this character */
  return (voc_trie_nil);
}

/*
   Deliver the next match of the input against the lexicon.

   Each call resumes the depth first walk over the vocabulary tries
   where the previous call left off. The walk is driven by the state of
   the top stack frame:
     s_marker:  select the next vocabulary (and its lexeme marker)
     s_entry:   try to match the entry stored at the current trie node
     s_match_p: descend on the input character itself
     s_match_t: descend on the translation of the input character
     s_exit:    pop the frame

   Returns the input position directly behind the matched lexeme, or
   NULL when all vocabularies have been tried. After a successful
   return the top frame still refers to the trie node of the matched
   entry.

   The stack holds MAX_MATCH_LEN + 1 frames. When it is full, the walk
   does not descend any deeper: longer lexemes are then not delivered,
   instead of writing beyond the end of the stack.
*/
char *lxcn_next_lexicon_match (LexiconIterator iter)
{ while (1)
    { iter_state state = iter -> sp -> curr_state;
      VocTrie curr_trie = iter -> sp -> curr_trie;
      char *input = iter -> sp -> input;

      switch (state)
	{ case s_marker:
	    { /* In this state we advance to the next vocabulary and marker */
	      iter -> curr_voc++;

	      /* Also covers the rare case of an empty lexicon */
	      if (iter -> curr_voc >= iter -> lexicon -> nr_vocabularies)
	        return (NULL);	/* Done matching */

	      /*
	         New vocabulary to try.
	         Only the bottom frame is ever in state s_marker,
	         so there is always room for this push.
	      */
	      iter -> curr_mark = iter -> lexicon -> all_lexeme_markers[iter -> curr_voc];
	      iter -> sp++;
	      iter -> sp -> curr_state = s_entry;
	      iter -> sp -> curr_trie = iter -> lexicon -> all_vocabularies[iter -> curr_voc];
	      iter -> sp -> input = input;
	    }; break;
	  case s_entry:
	    { /* Remember to continue with the next character as search key */
	      iter -> sp -> curr_state = s_match_p;

	      /* A node without search key holds no entry of its own */
	      if (curr_trie -> search_key == NULL) break;

	      /* The part of the key before rem_offset was matched while descending */
	      input = match_lexeme_tail (input, curr_trie -> search_key + curr_trie -> rem_offset);
	      if (input != NULL)
		return (input);
	    }; break;
	  case s_match_p:
	    { char inp_ch = *input;
	      char lex_ch = inp_ch;
	      VocTrie sub_trie;

	      /* Any blank in the input is looked up as a space */
	      if (lxcn_is_blank (inp_ch)) lex_ch = ' ';
	      sub_trie = search_subtrie (curr_trie, lex_ch);

	      /* Remember we have to check the translated character */
	      iter -> sp -> curr_state = s_match_t;

	      /* Only descend if there is still a free frame on the stack */
	      if ((sub_trie != voc_trie_nil) &&
		  (iter -> sp - iter -> stack < MAX_MATCH_LEN))
		{ /* We have longer lexemes to match */
		  input++;
		  if (lex_ch == ' ')
		    while (lxcn_is_blank (*input)) input++;
		  iter -> sp++;
		  iter -> sp -> curr_state = s_entry;
		  iter -> sp -> curr_trie = sub_trie;
		  iter -> sp -> input = input;
		  break;
		};
	    }; /* fall thru */
	  case s_match_t:
	    { char inp_ch = *input;
	      char trans_ch = lxcn_translate (inp_ch);
	      char lex_ch = trans_ch;
	      VocTrie sub_trie;

	      /* Remember we have to pop the current frame after continuation */
	      iter -> sp -> curr_state = s_exit;

	      /* If translation is identical or input is blank, we already tried the character */
	      if ((inp_ch == trans_ch) || lxcn_is_blank (inp_ch))
	        break;

	      sub_trie = search_subtrie (curr_trie, lex_ch);

	      /* Only descend if there is still a free frame on the stack */
	      if ((sub_trie != voc_trie_nil) &&
		  (iter -> sp - iter -> stack < MAX_MATCH_LEN))
		{ /* We have longer lexemes to match */
	          input++;
		  iter -> sp++;
	          iter -> sp -> curr_state = s_entry;
	          iter -> sp -> curr_trie = sub_trie;
	          iter -> sp -> input = input;
	          break;
	        };
	    }; /* fall thru */
	  case s_exit:
	    { /* If possible, pop the top frame */
	      if (iter -> sp == iter -> stack)
		return (NULL);			/* Done matching */
	      iter -> sp--;
	    };
	  default: break;
	};
    };

  /* Not reached */
  return (NULL);
}

/*
   Release an iterator created by lxcn_init_lexicon_match:
   first its frame stack, then the iterator record itself.
   The iterator must not be used afterwards.
*/
void lxcn_finish_lexicon_match (LexiconIterator iter)
{ match_stack frames = iter -> stack;

  abs_free ((void *) frames, "lxcn_finish_lexicon_match");
  abs_free ((void *) iter, "lxcn_finish_lexicon_match");
}

/*
   Access routines for matches
*/
/*
   Deliver the details of the entry at the top of the iterator stack.
   It is meant to be called after lxcn_next_lexicon_match has delivered a match.

   entry_nr:       receives the info field of the trie node
   matched_lexeme: receives the search key of the trie node
   matched_marker: receives the lexeme type derived from the lower three
                   bits of the marker of the current vocabulary, or
                   SingleToken if is_an_old_lex_marker does not accept them
*/
void lxcn_get_lexicon_match_info (LexiconIterator iter, int *entry_nr,
                                  char **matched_lexeme, LexemeType *matched_marker)
{ match_stack top = iter -> sp;
  VocTrie hit = top -> curr_trie;

  *entry_nr = hit -> info;
  *matched_lexeme = hit -> search_key;

  if (is_an_old_lex_marker (iter -> curr_mark & 7, matched_marker))
    return;
  *matched_marker = SingleToken;
}
