/*
   File: ebase_voc_search.c
   Implements the search through the vocabularies of a lexicon

   Copyright 2012 Marc Seutter
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: ebase_voc_search.c,v 1.4 2012/07/27 09:30:22 marcs Exp $"
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* System includes */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* libdcg includes */
#include <dcg_error.h>
#include <dcg_alloc.h>

/* local includes */
#include "ebase_input.h"
#include "ebase_vocabulary.h"
#include "ebase_lexicon.h"
#include "ebase_vocabulary_impl.h"
#include "ebase_lexicon_impl.h"
#include "ebase_voc_search.h"

/*
   Try to match the remaining part of a lexeme against the input.

   lex:       lexicon, handed through to the ebs_is_... input recognizers
   input:     in: the input position to start matching at;
              out (on success only): the first position after the match
   trans:     if not NULL, the characters of a translation that still have
              to be matched against the lexeme before the input is consulted
   lexeme:    the zero terminated lexeme tail to match
   curr_mark: marker of the current vocabulary; when LexemeLiteralBit is set
              no translations are tried
   rpenalty:  on success, the penalty collected through ebs_is_translation
              during this call is added to *rpenalty

   Returns 1 if the lexeme tail matches, 0 otherwise. On failure *input and
   *rpenalty are left untouched.
*/
static int lexeme_tail_matches (Lexicon lex, char **input, char *trans, char *lexeme,
				int curr_mark, Penalty *rpenalty)
{ /* Try to match the text */
  char *ptr = *input;
  char *ctrans = trans;
  Penalty delta_penalty = 0;
  char lex_ch;

  while ((lex_ch = *lexeme++) != 0)
    { char *tstr = NULL;
      char tch = 0;

      /* If next lexeme char is ' ', expect white space */
      if (lex_ch == ' ')
	{ /* a space matches any non empty white space */
	  if (ctrans != NULL) return (0);
	  if (!ebs_is_white_spaces (lex, &ptr)) return (0);
	}

      /* If next lexeme char is the soft hyphen, process special */
      else if ((unsigned char) lex_ch == SoftHyphenChar)
	{ /*
	     A soft hyphen matches white space, a hyphen or absence of hyphen:
	     well@known -> well known, well-known or wellknown.
	     It may be that we also have to allow for a hyphen followed by white
	     space. Maybe we should replace the direct recognition by a call to
	     ebs_is_hyphen to incorporate all UTF8 hyphens in the recognition
	  */
	  int was_space;
	  if (ctrans != NULL) return (0);
	  was_space = ebs_is_white_spaces (lex, &ptr);
	  if (!was_space && (*ptr == '-'))
	    /* Hyphen matches */
	    ptr++;
	  /* Else do nothing, lexeme points to next letter to match */
	}

      /* If we are in the middle of a UTF8 translation */
      else if (ctrans != NULL)
	{ if (lex_ch != *ctrans) return (0);
	  ctrans++;
	  /* Translation exhausted: continue with the plain input */
	  if (!(*ctrans)) ctrans = NULL;
	}

      /* Match character or its translation */
      else if (lex_ch == *ptr) ptr++;

      /* Literal lexeme, no translations */
      else if (curr_mark & LexemeLiteralBit) return (0);

      /* Try if we recognize a translation */
      else if (ebs_is_translation (lex, &ptr, &tch, NULL, &tstr, &delta_penalty))
	{ if (tstr != NULL)
	    { if (lex_ch != *tstr) return (0);
	      tstr++;
	      /*
		 Only remember the translation if there is something left of
		 it to match: ctrans must either be NULL or point to at least
		 one pending character, otherwise the next lexeme character
		 would be compared against the string terminator.
	      */
	      if (*tstr) ctrans = tstr;
	    }
	  else if (lex_ch == tch) ptr++;
	  else return (0);
	}
      else return (0);
    };

  /* The lexeme may not end while part of a translation is still unmatched */
  if ((ctrans != NULL) && (*ctrans != 0)) return (0);

  /* Report success */
  *input = ptr;
  *rpenalty += delta_penalty;
  return (1);
}

/*
   Matching of input against the lexicon.
   Since we must find every matching lexeme in the lexicon
   in the current input, we must search the lexicon using
   an iterator. 
*/

/* Define the structure of the lexicon iterator */
/*
   States of the lexicon iterator; every stack frame carries the state
   in which it will be resumed.
*/
typedef enum
{ s_marker            = 0,	/* advance to the next vocabulary and marker */
  s_entry             = 1,	/* try the search key of the current trie node */
  s_match_key         = 2,	/* descend the trie on the next character */
  s_match_trans       = 3,	/* descend the trie on a translated character */
  s_match_soft_hyphen = 4,	/* descend the trie on a soft hyphen */
  s_pop_frame         = 5	/* nothing left to try: drop the frame */
} iter_state;

/*
   A frame of the match stack. The frames form a singly linked list through
   'prev'; each frame records one position in the search: a trie node, the
   input still to be matched and the state in which to resume.
*/
typedef struct stack_item_rec *match_stack;
#define match_stack_nil ((match_stack) NULL)
struct stack_item_rec
{ iter_state curr_state;	/* State in which this frame is resumed */
  VocTrie curr_trie;		/* Trie node this frame is positioned at */
  Penalty curr_penalty;		/* Penalty accumulated due to translations */
  char *input;			/* Pointer to input to be matched */
  char *trans;			/* Pending part of an utf8 translation, or NULL */
  match_stack prev;		/* Points to previous stack entry */
};

/*
   The iterator proper: the lexicon being searched, the vocabulary
   currently being tried and the stack of pending search positions.
*/
struct lexicon_voc_iterator_rec
{ Lexicon lex;			/* Pointer to lexicon */
  int curr_voc;                 /* Index of current vocabulary, -1 before the first */
  int curr_mark;		/* Marker belonging to the current vocabulary */
  char *start_input;		/* Pointer to start of input to be matched */
  match_stack sp;		/* Top of the match stack (last frame pushed) */
};

/*
   Note: the pushing and popping of iterator frames can be optimized by
   local caching of frames. However, since these frames end up on the
   same free list in the memory management system, they probably get
   reused time after time anyway
*/
/*
   Push a new frame holding the given search position on top of the
   match stack of the iterator.
*/
static void push (LexiconVocIterator iter, iter_state curr_state, VocTrie curr_trie,
		  Penalty curr_penalty, char *input, char *trans)
{ match_stack top;

  /* Allocate the frame and link it in front of the current top */
  top = (match_stack) dcg_malloc (sizeof (struct stack_item_rec));
  top -> prev = iter -> sp;

  /* Record the search position */
  top -> curr_state = curr_state;
  top -> curr_trie = curr_trie;
  top -> curr_penalty = curr_penalty;
  top -> input = input;
  top -> trans = trans;

  /* The new frame is now the top of the stack */
  iter -> sp = top;
}

/*
   Unlink the top frame from the match stack and hand it to dcg_detach.
   The stack must not be empty.
*/
static void pop (LexiconVocIterator iter)
{ match_stack top = iter -> sp;
  match_stack below = top -> prev;

  iter -> sp = below;
  dcg_detach ((void **) &top);
}

/*
   Create an iterator that will enumerate the matches of the lexicon
   against the given input. The iterator starts before the first
   vocabulary, with a single s_marker frame on its stack.
*/
LexiconVocIterator ebs_init_lexicon_voc_match (Lexicon lex, char *input)
{ LexiconVocIterator new_iter;

  new_iter = (LexiconVocIterator) dcg_malloc (sizeof (struct lexicon_voc_iterator_rec));

  /* Nothing tried yet: no vocabulary, no marker, empty stack */
  new_iter -> sp = match_stack_nil;
  new_iter -> curr_mark = 0;
  new_iter -> curr_voc = -1;
  new_iter -> start_input = input;
  new_iter -> lex = lex;

  /* The bottom frame selects the vocabularies one after the other */
  push (new_iter, s_marker, NULL, 0, input, NULL);
  return (new_iter);
}

/*
   Look up the sub trie that is reached from 'trie' by the character 'key'.
   The tails of a trie node are kept in a binary search tree ordered on the
   key taken as unsigned char. Returns voc_trie_nil if there is no such tail.
*/
static VocTrie search_subtrie (VocTrie trie, char key)
{ unsigned char wanted = (unsigned char) key;
  VocIndexTree node;

  /* Walk down the search tree until we find the key or fall off */
  for (node = trie -> tails; node != voc_index_tree_nil; )
    { if (wanted == node -> key) return (node -> sub_trie);
      node = (wanted < node -> key) ? node -> left : node -> right;
    };
  return (voc_trie_nil);
}

/*
   Main routine: Find the next match in the search space

   This is a state machine. It starts in state s_marker to match the lexicon
   marker (pre/postfix bits etc), then goes to s_entry to match the word
   proper. This state checks if an end of the search is reached. If so,
   it returns the next unrecognized position in the input and the total
   accumulated penalty due to translations.

   Next, each letter of the input word is matched in state s_match_key, and if
   it makes a difference, a translated letter is matched in s_match_trans.

   The state s_match_soft_hyphen checks if a soft hyphen is present in the
   current vocabulary trie and if so, tries to match the rest.

   Finally, the current frame is popped and a frame that was pushed earlier
   gets back to work.
*/
/*
   iter:     iterator created by ebs_init_lexicon_voc_match
   rpenalty: on a match, receives the penalty accumulated for that match;
             not written when NULL is returned

   Returns the input position following the next matching lexeme, or NULL
   when all vocabularies have been tried. After a match the top frame of
   the iterator stack still refers to the trie node that matched.
*/
char *ebs_next_lexicon_voc_match (LexiconVocIterator iter, Penalty *rpenalty)
{ while (1)
    { /* Work on copies of the top frame; only curr_state is written back */
      iter_state state = iter -> sp -> curr_state;
      VocTrie curr_trie = iter -> sp -> curr_trie;
      Penalty curr_penalty = iter -> sp -> curr_penalty;
      char *input = iter -> sp -> input;
      char *trans = iter -> sp -> trans;

      switch (state)
	{ case s_marker:
	    { /* In this state we advance to the next vocabulary and marker */
	      Lexicon lex = iter -> lex;
	      iter -> curr_voc++;

	      /* Also covers the rare case of an empty lexicon */
	      if (iter -> curr_voc >= lex -> nr_rt_vocs)
		return (NULL);	/* Done matching */

	      /* New vocabulary to try */
	      iter -> curr_mark = lex -> rt_voc_markers[iter -> curr_voc];
	      push (iter, s_entry, lex -> rt_vocabularies[iter -> curr_voc] -> trie, 0,
		    input, NULL);
	    }; break;

	  case s_entry:
	    { /* Remember to continue with the next character as search key */
	      iter -> sp -> curr_state = s_match_key;
	      if (curr_trie -> search_key == NULL) break;
	      if (!lexeme_tail_matches (iter -> lex, &input, trans,
					curr_trie -> search_key + curr_trie -> rem_offset,
					iter -> curr_mark, &curr_penalty))
		break;

	      /* Check if we really recognized something */
	      if (input == iter -> start_input)
		break;

	      /* Ok, report success */
	      *rpenalty = curr_penalty;
	      return (input);
	    };

	  case s_match_key:
	    { VocTrie sub_trie;
	      char lex_ch;

	      /* Remember whether this frame sits in the middle of a translation */
	      int in_trans = (trans != NULL);

	      /* Determine the next character to search in the trie */
	      if (in_trans)
		{ lex_ch = *trans++;
		  /* Translation exhausted: the sub trie continues with the input */
		  if (!(*trans)) trans = NULL;
		}
	      else if (ebs_is_white_space (iter -> lex, &input))
		lex_ch = ' ';
	      else lex_ch = *input++;
	      sub_trie = search_subtrie (curr_trie, lex_ch);

	      /*
		 Determine the next state to enter. The decision is based on
		 the frame as it was entered: a frame in the middle of a
		 translation has no alternatives and may never reach the
		 state s_match_trans, which treats a pending translation
		 as an internal error.
	      */
	      if (in_trans)
		/* No translations, no soft hyphens */
		iter -> sp -> curr_state = s_pop_frame;
	      else if ((iter -> curr_mark & LexemeLiteralBit) || (lex_ch == ' '))
		/* No translations allowed */
		iter -> sp -> curr_state = s_match_soft_hyphen;
	      else iter -> sp -> curr_state = s_match_trans;

	      /* If we found a matching subtrie, we have longer lexemes to match */
	      if (sub_trie != voc_trie_nil)
		push (iter, s_entry, sub_trie, curr_penalty, input, trans);
	    }; break;

	  case s_match_trans:
	    { VocTrie sub_trie;
	      char lex_ch;
	      /* Initialized: ebs_is_translation may leave one of them unset */
	      char *tstr = NULL;
	      char tch = 0;
	      if (trans != NULL)
		dcg_internal_error ("ebs_next_lexicon_voc_match");

	      /* Remember we have to check for soft hyphens after this state */
	      iter -> sp -> curr_state = s_match_soft_hyphen;

	      /* Check if we can translate the character ahead */
	      if (!ebs_is_translation (iter -> lex, &input, &tch, NULL, &tstr, &curr_penalty))
		break;

	      /* If we got a string returned, we have an utf8 translation */
	      if (tstr != NULL)
		{ lex_ch = *tstr++;
		  /* Hand on what is left of the translation, NULL if nothing */
		  if (!(*tstr)) tstr = NULL;
		}
	      else lex_ch = tch;
	      sub_trie = search_subtrie (curr_trie, lex_ch);

	      /* If we found a matching subtrie, we have longer lexemes to match */
	      if (sub_trie != voc_trie_nil)
		push (iter, s_entry, sub_trie, curr_penalty, input, tstr);
	    }; break;

	  case s_match_soft_hyphen:
	    { /* Remember we have to pop the current frame after continuation */
	      VocTrie sub_trie = search_subtrie (curr_trie, SoftHyphenChar);
	      iter -> sp -> curr_state = s_pop_frame;
	      if (sub_trie == voc_trie_nil)
		break;

	      /* We have a soft hyphen in the trie, so there are 3 possibilities */
	      /* Note: in the future we have to take all possible utf8 hyphens into account */
	      if (*input == '-') input++;
	      else if (ebs_is_white_space (iter -> lex, &input)) ;	/* Ok */
	      /* else do not move the input pointer */

	      /* Match the current input pointer against the SoftHyphen subtrie */
	      push (iter, s_entry, sub_trie, curr_penalty, input, NULL);
	    }; break;

	  case s_pop_frame:
	    { /* All alternatives of the top frame are exhausted, drop it */
	      pop (iter);
	    }; break;

	  default: break;
	};
    };

  /* Not reached */
  return (NULL);
}

/*
   Release an iterator. The iterator must be back at its bottom (s_marker)
   frame and that frame must be the only one left on the stack; anything
   else is reported as an internal error. The iterator itself is handed
   to dcg_detach through the caller's pointer.
*/
void ebs_finish_lexicon_voc_match (LexiconVocIterator *iter)
{ LexiconVocIterator it = *iter;

  /* The top frame must be the vocabulary selecting frame */
  if (it -> sp -> curr_state != s_marker)
    dcg_internal_error ("ebs_finish_lexicon_voc_match");
  pop (it);

  /* ... and there may be nothing below it */
  if (it -> sp != NULL)
    dcg_internal_error ("ebs_finish_lexicon_voc_match");
  dcg_detach ((void **) iter);
}

/*
   Access routines for matches
*/
/*
   Deliver the details of the trie node in the top frame of the iterator:
   its info field as entry number, its search key as the matched lexeme
   and the marker of the current vocabulary.
*/
void ebs_get_lexicon_voc_match_info (LexiconVocIterator iter, int *entry_nr,
				     char **matched_lexeme, int *matched_marker)
{ VocTrie matched_node = iter -> sp -> curr_trie;

  *entry_nr = matched_node -> info;
  *matched_lexeme = matched_node -> search_key;
  *matched_marker = iter -> curr_mark;
}
