/*
   File: lexemes.c
   Checks lexemes and other input related stuff

   Copyright (C) 2011 Marc Seutter

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: lexemes.c,v 1.22 2012/08/16 20:51:15 marcs Exp $"
*/

/* standard includes */
#include <stdio.h>

/* libdcg includes */
#include <dcg.h>
#include <dcg_error.h>

/* libeagbase includes */
#include <ebase_input.h>

/* Local includes */
#include "eag_ds.h"
#include "options.h"
#include "globals.h"
#include "contsens.h"
#include "lexemes.h"

/*
   For white space matching, the conventions of the old EAG{1,2} and
   those of AGFL must be combined into proper semantics.

   When the white space option has been explicitly set to an empty
   string, no white space will be implicitly read (old EAG{1,2}
   convention). Regular expressions and terminals must then explicitly
   match any spaces they want to read away.

   If the white space option has been set (default to " \t\r\n\f"),
   it is required that the single space be a part of it. In this
   case, regular expressions are transformed in the following way:

     a) a space will match one or more white space chars i.e. [WS]+.
     b) a dot (.) will not match a white space char i.e. [^WS].

   Regular expressions are therefore transformed. Simple terminals
   are also transformed before being passed to the lexicon generator.

   The check that the space is part of the white space option is
   already implemented in the lexer as well as the conversion of
   escaped \n,\t,\r to their equivalent actual characters.

   For the evaluation of affix rules, we need to store some additional
   information to recuperate the original string after checking.

   Since the handling of simple terminals is different in EAG than
   in AGFL, the user may set the agfl compatibility flag, meaning
   that the regexp operators +, *, ), [, ], ?, . will not be recognized
   in simple terminals.
*/

/*
   Collection of the white space and terminators pragmat
*/
static int convert_char_to_utf32 (char *kind, char **bptr, int *conv)
{ int nr_cbytes = 0;
  char *ptr = *bptr;
  int cur_ch;
  if (!*ptr) return (0);
  cur_ch = (int)(unsigned int)(unsigned char) *ptr++;
  *conv = cur_ch;
  *bptr = ptr;
  if (!utf8_processing) return (1);
  if (cur_ch < 0x80) return (1);

  /* We are in UTF8 mode and need continuation bytes */
  if ((cur_ch & 0xF8) == 0xF0)
    { nr_cbytes = 3;
      cur_ch = (cur_ch & 7);	/* lowest 3 bits valid, 3 continuation bytes */
    }
  else if ((cur_ch & 0xF0) == 0xE0)
    { nr_cbytes = 2;
      cur_ch = (cur_ch & 0xF);	/* lowest 4 bits valid, 2 continuation bytes */
    }
  else if ((cur_ch & 0xE0) == 0xC0)
    { nr_cbytes = 1;
      cur_ch = (cur_ch & 0x1F);	/* lowest 5 bits valid, 1 continuation byte */
    }
  else dcg_error (0, "Illegal UTF8 sequence in %s literal", kind);
  while (nr_cbytes > 0)
    { int next_ch = (int)(unsigned int)(unsigned char) *ptr;
      if (!next_ch)
	{ dcg_error (0, "Premature end of UTF8 sequence in %s literal", kind);
	  cur_ch <<= nr_cbytes;
	  *bptr = ptr;
	  *conv = cur_ch;
	  return (0);
	};
      ptr++;
      if ((next_ch & 0xC0) != 0xA0)
	dcg_error (0, "Illegal UTF8 sequence in %s literal", kind);
      cur_ch = (cur_ch << 6) | (next_ch & 0x3F);
      nr_cbytes--;
    };
  *bptr = ptr;
  *conv = cur_ch;
  return (1);
}

static int_list convert_cset_to_utf32_set (char *kind, char *src)
{ char *ptr = src;
  int_list result;
  int conv;
  
  if (src == string_nil) return (new_int_list ());
  result = init_int_list (strlen (src));
  while (convert_char_to_utf32 (kind, &ptr, &conv))
    app_int_list (result, conv);
  return (result);
}

void prepare_lexeme_gathering ()
{ dcg_hint ("      preparing lexeme administration...");
  if (white_space_option == NULL)
    white_space_option = new_string (" \t\r\n\f");
  if ((translate_option != TranslationsByFile) && (translate_fname[0]))
    { /* Normalize */
      detach_string (&translate_fname);
      translate_fname = new_string ("");
    };
  white_spaces = convert_cset_to_utf32_set ("white space", white_space_option);
  separators = convert_cset_to_utf32_set ("separators", separators_option);
}

/*
   Determine the parse limits for the regexp parser, by peeling off
   an optional leading '-' (Suffix marker), an optional leading (!)
   (Literal marker) and a trailing '-' (Prefix marker).
*/
static void pick_parse_limits (char *text, char **begin, char **end, int *marker)
{ int diff = strlen (text);
  *begin = text;
  *end = text + diff;
  if (hyphen_convention_option && ((**begin) == '-'))
    { /* Leading '-' is a suffix */
      *begin += 1;
      *marker |= LexemeSuffixBit;
      diff--;
    }
  if (!diff) return;	/* Bad terminal */
  if (hyphen_convention_option && ((**begin) == '!'))
    { /* Leading '!' is a literal match (meaningless for regexps */
      *begin += 1;
      *marker |= LexemeLiteralBit;
      diff--;
    }
  if (!diff) return;	/* Bad terminal */
  /* Check the case ending in '\-' */
  if ((diff > 1) && ((*end)[-2] == '\\') && ((*end)[-1] == '-')) return;
  if (hyphen_convention_option && ((*end)[-1] == '-'))
    { /* Trailing '-' is a prefix */
      *end -= 1;
      *marker |= LexemePrefixBit;
    }
}

/*
   We use the following grammar for regular expressions:

   regexp:
     regexp '|' term;
     term

   term:
     term closure;
     closure

   closure:
     closure '?';
     closure '*';
     closure '+';
     atom

   atom:
     <normal/utf8 character>;
     ".";
     cset;
     "(", regexp, ")";
     <empty>.

   Parsed character sets are immediately entered into the global all_csets.
   Character and character set parsing (including Unicode processing)
*/
static int is_this_character (char **ptr, char *end, char ch)
{ if (*ptr == end) return (0);
  if (**ptr != ch) return (0);
  (*ptr)++;
  return (1);
}

static void should_be_this_character (int gnr, int line, int col, char **ptr, char *end, char ch)
{ if (is_this_character (ptr, end, ch)) return;
  contsens_error_by_gnr (gnr, line, col, "'%c' expected", ch);
}

static int is_uc_character (char **ptr, char *end, int *uc_ch)
{ *uc_ch = (int)(unsigned int)(unsigned char) (**ptr);
  if (*ptr == end) return (0);
  (*ptr)++;
  return (1);
}

static int is_valid_character (int gnr, int line, int col, char **ptr, char *end, int *uc_ch)
{ int nr_cbytes = 0;
  int cur_ch;
  if (!is_uc_character (ptr, end, &cur_ch))
    { *uc_ch = 0;
      return (0);
    };
  *uc_ch = cur_ch;
  if (!utf8_processing) return (1);
  if (cur_ch < 0x80) return (1);
  if ((cur_ch & 0xF8) == 0xF0)
    { nr_cbytes = 3;
      cur_ch = (cur_ch & 7);	/* lowest 3 bits valid, 3 continuation bytes */
    }
  else if ((cur_ch & 0xF0) == 0xE0)
    { nr_cbytes = 2;
      cur_ch = (cur_ch & 0xF);	/* lowest 4 bits valid, 2 continuation bytes */
    }
  else if ((cur_ch & 0xE0) == 0xC0)
    { nr_cbytes = 1;
      cur_ch = (cur_ch & 0x1F);	/* lowest 5 bits valid, 1 continuation byte */
    }
  else contsens_error_by_gnr (gnr, line, col, "Illegal UTF8 sequence in literal");
  while (nr_cbytes > 0)
    { int next_ch;
      if (!is_uc_character (ptr, end, &next_ch))
	{ contsens_error_by_gnr (gnr, line, col, "Premature end of UTF8 sequence");
	  cur_ch <<= nr_cbytes;
	  break;
	};
      if ((next_ch & 0xC0) != 0xA0)
	contsens_error_by_gnr (gnr, line, col, "Illegal UTF8 sequence in literal");
      cur_ch = (cur_ch << 6) | (next_ch & 0x3F);
      nr_cbytes--;
    };
  *uc_ch = cur_ch;
  return (1);
}

static int add_new_unique_cset (cset my_set)
{ int ix;
  for (ix = 0; ix < all_csets -> size; ix++)
    if (cmp_cset (all_csets -> array[ix], my_set) == 0)
      return (ix);
  app_cset_list (all_csets, my_set);
  return (all_csets -> size - 1);
}

static void should_be_rest_cset (int gnr, int line, int col, char **ptr, char *end, regexp *re)
{ int invert = 0;
  int first = 0;
  cset_part_list parts = new_cset_part_list ();
  cset my_set;
  if (!is_valid_character (gnr, line, col, ptr, end, &first))
    contsens_error_by_gnr (gnr, line, col, "Premature end of character set");
      
  /* Check for invert flag */
  if (first == '^')
    { invert = 1;
      if (!is_valid_character (gnr, line, col, ptr, end, &first))
        contsens_error_by_gnr (gnr, line, col, "Premature end of character set");
    };

  /* Create character set and register it */
  my_set = new_cset (invert, parts);
  *re = regexp_nil;
  if (*ptr == end) return;	/* Error */

  /* Check for leading - or ] */
  if ((first == '-') || (first == ']'))
    { app_cset_part_list (parts, new_Part_match (first));
      if (!is_valid_character (gnr, line, col, ptr, end, &first))
        { contsens_error_by_gnr (gnr, line, col, "Premature end of character set");
	  return;
	};
    };

  /* Character set loop */
  while (1)
    { /* We have a character in first, check the next for a '-' or a ']' */
      int next, last;
      if (!is_valid_character (gnr, line, col, ptr, end, &next))
	{ contsens_error_by_gnr (gnr, line, col, "Premature end of character set");
	  app_cset_part_list (parts, new_Part_match (first));
	  break;
	}

      /* Check for closing ']' */
      if (next == ']')
	{ app_cset_part_list (parts, new_Part_match (first));
	  break;
	}

      /* If the next is not a '-' then iterate */
      if (next != '-')
	{ app_cset_part_list (parts, new_Part_match (first));
	  first = next;
	  continue;
	}

      /* Check the character following the '-' */
      if (!is_valid_character (gnr, line, col, ptr, end, &last))
	{ contsens_error_by_gnr (gnr, line, col, "Premature end of character set");
	  app_cset_part_list (parts, new_Part_match (first));
	  app_cset_part_list (parts, new_Part_match ((int) '-'));
	  break;
	}

      /* A '-' may end the character set */
      if (last == ']')
	{ /* Closing - at end */
	  app_cset_part_list (parts, new_Part_match (first));
	  app_cset_part_list (parts, new_Part_match ((int) '-'));
	  break;
	}

      /* We have a range, check its validity */
      if (first > last)
	contsens_error_by_gnr (gnr, line, col, "Illegal character range");
      else if (first == last)
	{ contsens_warning_by_gnr (gnr, line, col, "Silly character range");
	  app_cset_part_list (parts, new_Part_match (first));
	}
      else app_cset_part_list (parts, new_Part_range (first, last));

      /* Read next char */
      if (!is_valid_character (gnr, line, col, ptr, end, &first))
	{ contsens_error_by_gnr (gnr, line, col, "Premature end of character set");
	  return;
	};
      if (first == ']')
	break;
    };
  *re = new_Regexp_cset (add_new_unique_cset (my_set));
}

/*
   Regexp parsing
*/
static void may_be_regexp (int gnr, int line, int col, char **ptr, char *end, int rflag,
			   regexp *re);
static void may_be_atom (int gnr, int line, int col, char **ptr, char *end, int rflag, regexp *re)
{ int uc_char;
  if (rflag && is_this_character (ptr, end, '['))
    should_be_rest_cset (gnr, line, col, ptr, end, re);
  else if (rflag && is_this_character (ptr, end, '('))
    { may_be_regexp (gnr, line, col, ptr, end, rflag, re);
      should_be_this_character (gnr, line, col, ptr, end, ')');
    }
  else if (rflag && is_this_character (ptr, end, '.'))
    { if (strlen (white_space_option))
	*re = new_Regexp_nonwhitespace ();
      else *re = new_Regexp_anychar ();
    }
  else if (is_this_character (ptr, end, ' '))
    { if (strlen (white_space_option))
	*re = new_Regexp_whitespace ();
      else *re = new_Regexp_char ((int) ' ');
    }
  else if (is_this_character (ptr, end, '\\'))
    { uc_char = '\\';
      if (!is_valid_character (gnr, line, col, ptr, end, &uc_char))
        contsens_error_by_gnr (gnr, line, col, "character expected after backslash");
      *re = new_Regexp_char (uc_char);
    }
  else if (*ptr == end)
    *re = new_Regexp_empty ();
  else if (rflag && 
	   ((**ptr == '?')||(**ptr == '*')||(**ptr == '+')||(**ptr == ')')||(**ptr == '|')))
    *re = new_Regexp_empty ();
  else
    { (void) is_valid_character (gnr, line, col, ptr, end, &uc_char);
      *re = new_Regexp_char (uc_char);
    };
}

static void may_be_closure (int gnr, int line, int col, char **ptr, char *end, int rflag,
			    regexp *re)
{ may_be_atom (gnr, line, col, ptr, end, rflag, re);
  while (1)
    { if ((*ptr == end) || !rflag)
	return;
      else if (is_this_character (ptr, end, '?'))
	*re = new_Regexp_opt (*re);
      else if (is_this_character (ptr, end, '*'))
	*re = new_Regexp_star (*re);
      else if (is_this_character (ptr, end, '+'))
	*re = new_Regexp_plus (*re);
      else break;
    };
}

static void may_be_term (int gnr, int line, int col, char **ptr, char *end, int rflag, regexp *re)
{ regexp_list subs;
  may_be_closure (gnr, line, col, ptr, end, rflag, re);
  if (*ptr == end) return;
  if (rflag && ((**ptr == ')') || (**ptr == '|'))) return;
  subs = new_regexp_list ();
  app_regexp_list (subs, *re);
  *re = new_Regexp_concat (subs);
  while (*ptr != end)
    { regexp sub;
      if (rflag && ((**ptr == ')') || (**ptr == '|'))) return;
      may_be_closure (gnr, line, col, ptr, end, rflag, &sub);
      app_regexp_list (subs, sub);
    };
}

static void may_be_regexp (int gnr, int line, int col, char **ptr, char *end, int rflag, regexp *re)
{ may_be_term (gnr, line, col, ptr, end, rflag, re);
  if (rflag && is_this_character (ptr, end, '|'))
    { regexp_list subs = new_regexp_list ();
      app_regexp_list (subs, *re);
      *re = new_Regexp_or (subs);
      do
	{ regexp sub;
	  may_be_term (gnr, line, col, ptr, end, rflag, &sub);
	  app_regexp_list (subs, sub);
	}
      while (is_this_character (ptr, end, '|'));
    };
}

static void should_be_valid_regexp (int gnr, int line, int col, char *begin, char *end, int rflag,
				    regexp *re)
{ char *ptr = begin;
  if (ptr == end)
    { contsens_error_by_gnr (gnr, line, col, "empty regular expression or terminal");
      *re = regexp_nil;
    };
  may_be_regexp (gnr, line, col, &ptr, end, rflag, re);
  if (*re == regexp_nil) return;
  if (ptr != end)
    contsens_error_by_gnr (gnr, line, col, "garbage after valid literal");
}

/*
   Empty recognition
*/
static int regexp_can_produce_empty (regexp re)
{ switch (re -> tag)
    { case TAGRegexp_or:
	{ regexp_list subs = re -> Regexp_or.subs;
	  int ix;
	  for (ix = 0; ix < subs -> size; ix++)
	    if (regexp_can_produce_empty (subs -> array[ix]))
	      return (1);
	  return (0);
	};
      case TAGRegexp_concat:
	{ regexp_list subs = re -> Regexp_or.subs;
	  int ix;
	  for (ix = 0; ix < subs -> size; ix++)
	    if (!regexp_can_produce_empty (subs -> array[ix]))
	      return (0);
	  return (1);
	};
      case TAGRegexp_plus:
	return (regexp_can_produce_empty (re -> Regexp_plus.sub));
      case TAGRegexp_cset:
      case TAGRegexp_char:
      case TAGRegexp_anychar:
      case TAGRegexp_whitespace:
      case TAGRegexp_nonwhitespace:
	return (0);
      case TAGRegexp_star:
      case TAGRegexp_opt:
      case TAGRegexp_empty:
        return (1);
      default: dcg_bad_tag (re -> tag, "regexp_can_produce_empty");
    };
  return (0);
}

/*
   Terminal recognition
*/
static int is_my_valid_terminal (regexp re, int_list text)
{ switch (re -> tag)
    { case TAGRegexp_char:
	app_int_list (text, re -> Regexp_char.ch);
	return (1);
      case TAGRegexp_whitespace:
	app_int_list (text, (int)' ');
      case TAGRegexp_empty: return (1);
      case TAGRegexp_concat:
	{ regexp_list subs = re -> Regexp_concat.subs;
	  int ix;
	  for (ix = 0; ix < subs -> size; ix++)
	    if (!is_my_valid_terminal (subs -> array[ix], text))
	      return (0);
	  return (1);
	};
      default: break;
    };
  return (0);
}

static int is_a_valid_terminal (regexp re, int marker, terminal *term, char *origin)
{ int_list text = new_int_list ();
  if (!is_my_valid_terminal (re, text))
    { detach_int_list (&text);
      return (0);
    };
  *term = new_terminal (marker, text, origin);
  return (1);
}

static int add_new_unique_terminal (terminal my_term)
{ int ix;
  for (ix = 0; ix < all_terminals -> size; ix++)
    if (cmp_terminal (all_terminals -> array[ix], my_term) == 0)
      return (ix);
  app_terminal_list (all_terminals, my_term);
  return (all_terminals -> size - 1);
}

/*
   Regexp compilation into an NFA
*/
static nfa_state make_new_state (nfa my_nfa)
{ nfa_state my_state = new_nfa_state (my_nfa -> states -> size, 0, new_nfa_trans_list ());
  app_nfa_state_list (my_nfa -> states, my_state);
  return (my_state);
}

static void maybe_create_final_state (nfa my_nfa, nfa_state *final)
{ if (*final == nfa_state_nil)
    *final = make_new_state (my_nfa);
}

static void add_transition (nfa my_nfa, nfa_state source, nfa_trans trans)
{ nfa_state dest_state = my_nfa -> states -> array[trans -> dest];
  app_nfa_trans_list (source -> transitions, trans);
  dest_state -> indeg++;
}

static void translate_regexp (nfa my_nfa, regexp re, nfa_state initial, nfa_state *final)
{ maybe_create_final_state (my_nfa, final);
  switch (re -> tag)
    { case TAGRegexp_or:
	{ regexp_list subs = re -> Regexp_or.subs;
	  int ix;
	  for (ix = 0; ix < subs -> size; ix++)
	    translate_regexp (my_nfa, subs -> array[ix], initial, final);
	}; break;
      case TAGRegexp_concat:
	{ regexp_list subs = re -> Regexp_or.subs;
	  nfa_state previous = initial;
	  int ix;
	  for (ix = 0; ix < subs -> size - 1; ix++)
	    { nfa_state next = nfa_state_nil;
	      translate_regexp (my_nfa, subs -> array[ix], previous, &next);
	      previous = next;
	    };
	  translate_regexp (my_nfa, subs -> array[subs -> size - 1], previous, final);
	}; break;
      case TAGRegexp_star:
	{ nfa_state inter = make_new_state (my_nfa);
	  nfa_state inter2 = nfa_state_nil;
	  nfa_trans empty_ini = new_Trans_empty (inter -> self);
	  nfa_trans empty_loop = new_Trans_empty (inter -> self);
	  nfa_trans empty_final = new_Trans_empty ((*final) -> self);
	  translate_regexp (my_nfa, re -> Regexp_star.sub, inter, &inter2);
	  add_transition (my_nfa, initial, empty_ini);
	  add_transition (my_nfa, inter2, empty_loop);
	  add_transition (my_nfa, inter, empty_final);
	}; break;
      case TAGRegexp_plus:
	{ nfa_state inter = make_new_state (my_nfa);
	  nfa_state inter2 = nfa_state_nil;
	  nfa_trans empty_ini = new_Trans_empty (inter -> self);
	  nfa_trans empty_loop = new_Trans_empty (inter -> self);
	  nfa_trans empty_final = new_Trans_empty ((*final) -> self);
	  translate_regexp (my_nfa, re -> Regexp_plus.sub, inter, &inter2);
	  add_transition (my_nfa, initial, empty_ini);
	  add_transition (my_nfa, inter2, empty_loop);
	  add_transition (my_nfa, inter2, empty_final);
	}; break;
      case TAGRegexp_opt:
	{ nfa_trans empty_bypass = new_Trans_empty ((*final) -> self);
	  translate_regexp (my_nfa, re -> Regexp_opt.sub, initial, final);
	  add_transition (my_nfa, initial, empty_bypass);
	}; break;
      case TAGRegexp_cset:
	{ nfa_trans trans_cset = new_Trans_cset ((*final) -> self, re -> Regexp_cset.cs);
	  add_transition (my_nfa, initial, trans_cset);
	}; break;
      case TAGRegexp_char:
	{ nfa_trans trans_char = new_Trans_char ((*final) -> self, re -> Regexp_char.ch);
	  add_transition (my_nfa, initial, trans_char);
	}; break;
      case TAGRegexp_anychar:
	{ nfa_trans trans_any = new_Trans_anychar ((*final) -> self);
	  add_transition (my_nfa, initial, trans_any);
	}; break;
      case TAGRegexp_whitespace:
	{ nfa_trans trans_ws = new_Trans_whitespace ((*final) -> self);
	  add_transition (my_nfa, initial, trans_ws);
	}; break;
      case TAGRegexp_nonwhitespace:
	{ nfa_trans trans_nonws = new_Trans_nonwhitespace ((*final) -> self);
	  add_transition (my_nfa, initial, trans_nonws);
	}; break;
      case TAGRegexp_empty:
	{ nfa_trans trans_empty = new_Trans_empty ((*final) -> self);
	  add_transition (my_nfa, initial, trans_empty);
	}; break;
      default: dcg_bad_tag (re -> tag, "translate_regexp");
    };
}

/*
   NFA optimization by removing chains that end in an empty transition
*/
static int can_optimize_this_trans (nfa_state_list states, nfa_trans this_trans)
{ nfa_state dest_state = states -> array[this_trans -> dest];
  nfa_trans dest_trans_out;
  if (dest_state -> indeg > 1) return (0);
  if (dest_state -> indeg < 1) return (0);	/* already taken care off */
  if (dest_state -> transitions -> size > 1) return (0);
  if (dest_state -> transitions -> size == 0) return (0);
  dest_trans_out = dest_state -> transitions -> array[0];
  if (dest_trans_out -> tag != TAGTrans_empty) return (0);

  /* We have a transition followed by a single empty transition */
  this_trans -> dest = dest_trans_out -> dest;
  del_nfa_trans_list (dest_state -> transitions, 0);
  detach_nfa_trans (&dest_trans_out);
  dest_state -> indeg = 0;
  return (1);
}

static int can_optimize_this_state (nfa_state_list states, nfa_state this_state)
{ nfa_trans_list transitions = this_state -> transitions;
  int optimized = 0;
  int ix;
  for (ix = 0; ix < transitions -> size; ix++)
    if (can_optimize_this_trans (states, transitions -> array[ix]))
      optimized = 1;
  return (optimized);
}

/*
   Remove unreachable states
*/
static int_list prepare_state_remapping (nfa my_nfa)
{ nfa_state_list my_states = my_nfa -> states;
  int_list remap = init_int_list (my_states -> size);
  int remap_nr = 2;
  int ix;

  /* Keep initial and final state */
  app_int_list (remap, 0);
  app_int_list (remap, 1);

  /* remove states with no incoming transitions */
  for (ix = 2; ix < my_states -> size; ix++)
    { nfa_state my_state = my_states -> array[ix];
      if (my_state -> indeg)
	{ app_int_list (remap, remap_nr);
	  my_state -> self = remap_nr;
	  remap_nr++;
	}
      else app_int_list (remap, -1);
    };
  return (remap);
}

static void remove_unused_states (nfa my_nfa)
{ nfa_state_list my_states = my_nfa -> states;
  int ix = 2;
  while (ix < my_states -> size)
    if (!my_states -> array[ix] -> indeg)
      del_nfa_state_list (my_states, ix);
    else ix++;
}

static void remap_transitions (nfa my_nfa, int_list remap)
{ nfa_state_list my_states = my_nfa -> states;
  int ix;
  for (ix = 0; ix < my_states -> size; ix++)
    { nfa_state my_state = my_states -> array[ix];
      nfa_trans_list transitions = my_state -> transitions;
      int iy;
      for (iy = 0; iy < transitions -> size; iy++)
	{ nfa_trans my_trans = transitions -> array[iy];
	  my_trans -> dest = remap -> array[my_trans -> dest];
	};
    };
}

static void optimize_nfa (nfa my_nfa)
{ int_list remap;
  int change;
  do
    { nfa_state_list my_states = my_nfa -> states;
      int ix;
      change = 0;
      for (ix = 0; ix < my_states -> size; ix++)
	if (can_optimize_this_state (my_states, my_states -> array[ix]))
	  change = 1;
    }
  while (change);
  remap = prepare_state_remapping (my_nfa);
  remove_unused_states (my_nfa);
  remap_transitions (my_nfa, remap);
}

static int compile_regexp (regexp re, int kind, int marker, char *origin)
{ nfa_state initial = new_nfa_state (0, 0, new_nfa_trans_list ());
  nfa_state final = new_nfa_state (1, 1, new_nfa_trans_list ());
  nfa_state_list my_states = new_nfa_state_list ();
  nfa my_nfa = new_nfa (kind, marker, attach_string (origin), my_states);
  app_nfa_state_list (my_states, initial);
  app_nfa_state_list (my_states, final);
  translate_regexp (my_nfa, re, initial, &final);
  if (full_verbose && debug && dump_lexemes)
    { dcg_wlog ("Translated NFA:");
      pp_nfa (dcg_error_file (), my_nfa);
      dcg_wlog ("");
    };
  optimize_nfa (my_nfa);
  if (full_verbose && debug && dump_lexemes)
    { dcg_wlog ("Optimized NFA:");
      pp_nfa (dcg_error_file (), my_nfa);
      dcg_wlog ("");
    };

  /* For the moment we do not believe in identical nfa's */
  app_nfa_list (all_regexp_nfas, my_nfa);
  return (all_regexp_nfas -> size - 1);
}

/*
   Lexeme recognition
   Reparse the text of a lexeme and convert it into either a terminal
   or a nondeterministic finite automaton (regular expression matcher).
*/
void analyze_lexeme (int gnr, int line, int col, int rflag, char *text, int kind,
		     int *termnr, int *is_regexp, nullability *empty)
{ terminal my_term = terminal_nil;
  char *begin, *end;
  int marker = 0;
  regexp re;
  *empty = e_unknown;
  if (full_verbose && dump_lexemes)
    contsens_warning_by_gnr (gnr, line, col, "Entering regexp compiler with '%s'", text);
  pick_parse_limits (text, &begin, &end, &marker);
  should_be_valid_regexp (gnr, line, col, begin, end, rflag, &re);
  if (re == regexp_nil)
    { /* Error recovery */
      *termnr = -1;
      *is_regexp = 1;
      return;
    };

  /* For a regexp/terminal, we can deduce whether it may produce empty or not */
  if (regexp_can_produce_empty (re))
    *empty = e_may_produce_empty;
  else *empty = e_never_produces_empty;
  *is_regexp = !is_a_valid_terminal (re, marker, &my_term, text);
  if (*is_regexp)
    *termnr = compile_regexp (re, kind, marker, text);
  else *termnr = add_new_unique_terminal (my_term);
  if (full_verbose && dump_lexemes)
    { contsens_warning_by_gnr (gnr, line, col, "Parsed regexp with marker %d (%s):",
    	   marker, (*empty == e_may_produce_empty)?"can produce empty":"cannot produce empty");
      pp_regexp (dcg_error_file (), re);
      dcg_wlog ("");
      if (*is_regexp)
        { dcg_eprint ("This regexp is compiled to NFA (termnr = %d:", *termnr);
          pp_nfa (dcg_error_file (), all_regexp_nfas -> array[*termnr]);
          dcg_wlog ("");
        }
      else
        { dcg_eprint ("This regexp is a valid terminal (termnr = %d: ", *termnr);
          pp_terminal (dcg_error_file (), my_term);
          dcg_wlog ("");
        };
    };
}

char *normalize_text_constant (char *text, int *warn)
{ char *norm_cvt = (char *) dcg_malloc (strlen (text) + 1);	/* Allocate sufficient space */
  char *sptr, *dptr;
  char *norm_text;
  *warn = 0;
  for (sptr = text, dptr = norm_cvt; *sptr; )
    { /* Recognizing remaining escapes, maybe give a warning */
      if (*sptr == '\\')
	{ /* In affix terms, options strings, etc., the only valid escape left is '\\' */
	  sptr++;
	  if (*sptr != '\\') *warn = 1;
	};
      *dptr++ = *sptr++;
    };
  *dptr = '\0';
  norm_text = new_string (norm_cvt);
  dcg_detach ((void **) &norm_cvt);
  return (norm_text);
}

void try_dump_lexemes ()
{ int ix;
  if (!dump_lexemes) return;
  dcg_wlog ("Dump of white space: ");
  pp_int_list (dcg_error_file (), white_spaces);
  dcg_wlog ("\nDump of separators: ");
  pp_int_list (dcg_error_file (), separators);
  dcg_wlog ("\nTranslate option = %d, Translate file = '%s'",
	    translate_option, translate_fname);
  dcg_wlog ("Dump of all csets:");
  for (ix = 0; ix < all_csets -> size; ix++)
    { dcg_eprint ("%d: ", ix);
      pp_cset (dcg_error_file (), all_csets -> array[ix]);
      dcg_wlog ("");
    };
  dcg_wlog ("Dump of all terminals:");
  for (ix = 0; ix < all_terminals -> size; ix++)
    { dcg_eprint ("%d: ", ix);
      pp_terminal (dcg_error_file (), all_terminals -> array[ix]);
      dcg_wlog ("");
    };
  dcg_wlog ("Dump of all regexp NFAs:");
  for (ix = 0; ix < all_regexp_nfas -> size; ix++)
    { dcg_wlog ("%d: ", ix);
      pp_nfa (dcg_error_file (), all_regexp_nfas -> array[ix]);
      dcg_wlog ("");
    };
}

