/*
   File: ebase_regexp_search.c
   Implements the search through the regexps of a lexicon

   Copyright 2012 Marc Seutter
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: ebase_regexp_search.c,v 1.2 2012/07/27 09:30:22 marcs Exp $"
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* System includes */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* libdcg includes */
#include <dcg_error.h>
#include <dcg_alloc.h>

/* local includes */
#include "ebase_ds.h"
#include "ebase_input.h"
#include "ebase_lexicon.h"
#include "ebase_lexicon_impl.h"
#include "ebase_regexp_search.h"

/*
   Since we must find every matching regular expression in the lexicon
   against the current input, we must search the lexicon using an iterator. 
*/

/* Define the structure of the lexicon iterator */
/* States of the matching state machine; every stack frame carries one of them */
typedef enum
{ s_regexp = 0,		/* Bottom frame: advance to the next regexp NFA */
  s_final = 1,		/* Freshly pushed frame: check if its NFA state is final */
  s_match_input = 2,	/* Try the transitions of the frame's NFA state in turn */
  s_pop_frame = 3	/* All transitions have been tried: discard the frame */
} iter_state;

/*
   A frame of the iterator stack: one NFA state that is being explored
   at one position of the input. Frames are linked through 'prev'.
*/
typedef struct stack_item_rec *match_stack;
#define match_stack_nil ((match_stack) NULL)
struct stack_item_rec
{ iter_state curr_state;	/* State of the iterator state machine for this frame */
  nfa_state curr_nfa_state;	/* Current state of the Regexp NFA */
  char *input;			/* Pointer to input to be matched from this frame */
  int curr_trans;		/* Index of the transition tried last, -1 before the first */
  match_stack prev;		/* Points to previous stack entry (nil below the bottom) */
};

/* Complete state of one search through the regexp NFAs of a lexicon */
struct lexicon_regexp_iterator_rec
{ Lexicon lex;			/* Pointer to lexicon */
  char *start_input;		/* Pointer to start of input to be matched */
  int kind;			/* Bit mask: an nfa is tried when (nfa kind & kind) is nonzero */
  int curr_nfa_nr;		/* Number of current nfa, -1 before the first one */
  nfa curr_nfa;			/* Pointer to current regexp NFA, NULL before the first one */
  char *longest_end;		/* End of longest match of current nfa, NULL if none yet */
  match_stack sp;		/* Top of the frame stack (the frame pushed last) */
};

/*
   Note: the pushing and popping of iterator frames can be optimized by
   local caching of frames. However, since these frames end up on the
   same free list in the memory management system, they probably get
   reused time after time anyway
*/
/*
   Allocate a new frame for the given iterator state, NFA state and input
   position and make it the top of the iterator's stack. The new frame
   has not tried any transition yet (curr_trans is -1).
*/
static void push (LexiconRegexpIterator iter, iter_state curr_state,
		  nfa_state curr_nfa_state, char *input)
{ match_stack top = (match_stack) dcg_malloc (sizeof (*top));

  /* Link the frame on top of the current stack */
  top -> prev = iter -> sp;
  iter -> sp = top;

  /* Fill in what this frame has to work on */
  top -> curr_state = curr_state;
  top -> curr_nfa_state = curr_nfa_state;
  top -> input = input;
  top -> curr_trans = -1;
}

/*
   Remove the top frame from the iterator's stack and hand it back
   to the memory manager. The stack must not be empty.
*/
static void pop (LexiconRegexpIterator iter)
{ match_stack top = iter -> sp;
  match_stack below = top -> prev;

  iter -> sp = below;
  dcg_detach ((void **) &top);
}

/*
   Create an iterator that will try the regexp NFAs of lexicon 'lex'
   whose kind has a bit in common with 'kind' against 'input'.
   No NFA is selected yet; the stack only holds its bottom frame,
   which is in state s_regexp and remembers the start of the input.
*/
LexiconRegexpIterator ebs_init_lexicon_regexp_match (Lexicon lex, char *input, int kind)
{ LexiconRegexpIterator fresh;

  fresh = (LexiconRegexpIterator) dcg_malloc (sizeof (*fresh));

  /* What to search, and where */
  fresh -> lex = lex;
  fresh -> kind = kind;
  fresh -> start_input = input;

  /* Nothing tried and nothing matched so far */
  fresh -> curr_nfa = NULL;
  fresh -> curr_nfa_nr = -1;
  fresh -> longest_end = NULL;

  /* Empty stack, then create the bottom of the stack */
  fresh -> sp = match_stack_nil;
  push (fresh, s_regexp, NULL, input);
  return (fresh);
}

/*
   Character set matcher
*/
/*
   Decide whether character 'ch' belongs to character set 'cs'.
   The parts of the set are inspected in order until one of them
   accepts the character; a part is either a single character or
   an inclusive range. For an inverted set the outcome is negated.
   Returns 1 when the character is accepted, 0 otherwise.
*/
static int in_character_set (int ch, cset cs)
{ cset_part_list parts = cs -> parts;
  int found = 0;
  int nr;
  for (nr = 0; nr < parts -> size; nr++)
    { cset_part part = parts -> array[nr];
      int hit = 0;
      switch (part -> tag)
	{ case TAGPart_match:
	    hit = (ch == part -> Part_match.ch);
	    break;
	  case TAGPart_range:
	    hit = ((part -> Part_range.first <= ch) && (ch <= part -> Part_range.last));
	    break;
	  default: dcg_bad_tag (part -> tag, "in_character_set");
	};

      /* The first accepting part settles the membership */
      if (hit)
	{ found = 1;
	  break;
	};
    };
  return ((cs -> invert) ? !found : found);
}

/*
   Main routine: Find the next match in the search space

   This is a state machine. It starts in state s_regexp to proceed
   to the next NFA.

   In the state final, we check whether we have reached a final state
   of the current NFA. Since we search a longest match, we administer it
   in the iterator and continue. 

   In the state match_input, we check the current letter in the input
   (allowing for utf8 sequences) against the transitions of the current nfa state.
   On success, we push the next state and recognized input and continue.

   Finally, the current frame is popped and a frame that was pushed earlier
   gets back to work. If this frame has state s_regexp, it is the bottom
   element of the stack. If we had found a match, we return to our caller.
   Upon the continuation, we are back in state s_regexp to proceed.
*/
/*
   Deliver the next match of the iteration.

   Returns a pointer to the end of the longest match of the next regexp NFA
   (of the requested kind) that matches the input, or NULL when every NFA
   of the lexicon has been tried. Whenever this routine returns, only the
   bottom frame (state s_regexp) is left on the stack, so a following call
   continues with the next NFA.
*/
char *ebs_next_lexicon_regexp_match (LexiconRegexpIterator iter)
{ while (1)
    { /* Every turn of the loop works on the frame on top of the stack */
      iter_state state = iter -> sp -> curr_state;
      nfa_state curr_nfa_state = iter -> sp -> curr_nfa_state;
      /*
	 Local copy of the frame's input position: advancing it below does
	 not change the frame, so each transition of a state is tried
	 against the same position.
      */
      char *input = iter -> sp -> input;

      switch (state)
	{ case s_regexp:
	    { /* In this state we advance to the next regexp NFA, matching our kind */
	      Lexicon lex = iter -> lex;
	      nfa this_nfa;
	      iter -> curr_nfa_nr++;
	      if (iter -> curr_nfa_nr >= lex -> rt_regexp_nfas -> size)
	        return (NULL);	/* Done matching */
	      this_nfa = lex -> rt_regexp_nfas -> array[iter -> curr_nfa_nr];

	      /* If this nfa is not of the kind we want to search, step to the next */
	      if (!(this_nfa -> kind & iter -> kind))
		break;

	      /* New nfa to try: forget the previous match, begin in state 0 of the nfa */
	      iter -> curr_nfa = this_nfa;
	      iter -> longest_end = NULL;
	      push (iter, s_final, this_nfa -> states -> array[0], input);
	    }; break;

	  case s_final:
	    { /* Remember to continue with the next character in the input */
	      iter -> sp -> curr_state = s_match_input;
	      iter -> sp -> curr_trans = -1;
	      if (!curr_nfa_state -> final)
		break;
	      /* Final state reached: keep the match end that lies furthest in the input */
	      if (iter -> longest_end == NULL)
		iter -> longest_end = iter -> sp -> input;
	      else if (iter -> sp -> input > iter -> longest_end)
		iter -> longest_end = iter -> sp -> input;
	    }; break;

	  case s_match_input:
	    { nfa_trans_list transitions = curr_nfa_state -> transitions;
	      Lexicon lex = iter -> lex;
	      nfa_trans transition;
	      nfa_state dest;
	      /* Step to the next untried transition; when none is left, the frame is done */
	      iter -> sp -> curr_trans++;
	      if (iter -> sp -> curr_trans == transitions -> size)
		{ iter -> sp -> curr_state = s_pop_frame;
		  break;
		};
	      transition = transitions -> array[iter -> sp -> curr_trans];
	      dest = iter -> curr_nfa -> states -> array[transition -> dest];
	      /*
		 Each case below either breaks (the transition does not apply
		 at this position) or pushes a frame for the destination state
		 at the advanced input position.

		 NOTE(review): a nonzero result of ebs_is_utf8_char makes us
		 abandon the transition, and after a zero result 'ch' is used.
		 Presumably nonzero signals a decoding failure; confirm this
		 against the definition in ebase_input.
	      */
	      switch (transition -> tag)
		{ case TAGTrans_char:
		    { int tch = transition -> Trans_char.ch;
		      int ch;
		      if (!*input)
			break;
		      if (!lex -> utf8_processing)
			ch = (int)(unsigned int)(unsigned char) *input++;
		      else if (ebs_is_utf8_char (&input, &ch))
			break;
		      if (ch != tch)
			break;
		      push (iter, s_final, dest, input);
		    }; break;
		  case TAGTrans_cset:
		    { int cset_nr = transition -> Trans_cset.cs;
		      cset tset = lex -> rt_character_sets -> array[cset_nr];
		      int ch;
		      if (!*input)
			break;
                      if (!lex -> utf8_processing)
                        ch = (int)(unsigned int)(unsigned char) *input++;
                      else if (ebs_is_utf8_char (&input, &ch))
                        break;
		      if (!in_character_set (ch, tset))
		        break;
		      push (iter, s_final, dest, input);
		    }; break;
		  case TAGTrans_anychar:
		    { int ch;	/* Only receives the decoded character; its value is not used */
		      if (!*input)
			break;
		      if (!lex -> utf8_processing) input++;
                      else if (ebs_is_utf8_char (&input, &ch))
                        break;
		      push (iter, s_final, dest, input);
		    }; break;
		  case TAGTrans_whitespace:
		    { if (!*input)
			break;
		      if (!ebs_is_white_spaces (lex, &input))
			break;
		      push (iter, s_final, dest, input);
		    }; break;
		  case TAGTrans_nonwhitespace:
		    { int ch;	/* Only receives the decoded character; its value is not used */
		      if (!*input)
			break;
		      /*
			 NOTE(review): assumes that ebs_is_white_spaces leaves
			 'input' untouched when it returns 0 -- confirm.
		      */
		      if (ebs_is_white_spaces (lex, &input))
		        break;
		      if (!lex -> utf8_processing) input++;
                      else if (ebs_is_utf8_char (&input, &ch))
                        break;
		      push (iter, s_final, dest, input);
		    }; break;
		  case TAGTrans_empty:
		    /*
		       Consumes no input.
		       NOTE(review): a cycle of empty transitions in an nfa would
		       push frames without end; presumably the nfa construction
		       rules such cycles out -- confirm.
		    */
		    push (iter, s_final, dest, input);
		    break;
		  default: dcg_bad_tag (transition -> tag, "ebs_next_lexicon_regexp_match");
		};
	    }; break;

	  case s_pop_frame:
	    { /* If possible, pop the top frame */
	      pop (iter);
	      /* Back at the bottom frame: the current nfa is exhausted, report its match */
	      if ((iter -> sp -> curr_state == s_regexp) &&
		  (iter -> longest_end != NULL))
		return (iter -> longest_end);
	    };
	    /* fallthrough */
	  default: break;
	}; 
    };

  /* Not reached: the loop above is only left through a return */
  return (NULL);
}

/*
   Dispose of an iterator. The stack must consist of the bottom frame
   (state s_regexp) only; anything else is reported as an internal error.
   The bottom frame is popped and the iterator itself is detached
   through the caller's pointer.
*/
void ebs_finish_lexicon_regexp_match (LexiconRegexpIterator *iter)
{ LexiconRegexpIterator doomed = *iter;

  /* The top frame must be the bottom of the stack ... */
  if (doomed -> sp -> curr_state != s_regexp)
    dcg_internal_error ("ebs_finish_lexicon_regexp_match");
  pop (doomed);

  /* ... so that nothing remains once it has been removed */
  if (doomed -> sp != NULL)
    dcg_internal_error ("ebs_finish_lexicon_regexp_match");
  dcg_detach ((void **) iter);
}

/*
   Access routines for matches
*/
/*
   Report on the nfa the iterator is currently working with: its number
   in the lexicon, its kind and its marker. The iterator must have
   selected an nfa, since the current nfa is dereferenced unchecked.
*/
void ebs_get_lexicon_regexp_match_info (LexiconRegexpIterator iter,
					int *matched_nr, int *matched_kind, int *matched_marker)
{ nfa current = iter -> curr_nfa;

  *matched_nr = iter -> curr_nfa_nr;
  *matched_kind = current -> kind;
  *matched_marker = current -> marker;
}
