/*
   File: arts_lexer.c
   Implementation of lexical analysis module.

   Copyright 2008-2010 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id$"
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* standard includes */
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>

/* libabase includes */
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_pool_alloc.h>

/* liblexicon includes */
#include <lxcn_input.h>
#include <lxcn_search.h>

/* libtrellis includes */
#include <trel_input.h>

/* local includes */
#include "pattern.h"
#include "arts_ds.h"
#include "arts_io.h"
#include "arts_escape.h"
#include "arts_lexer.h"
#include "arts_posmemo.h"

/*
   For regular expressions, all $MATCH regexps are combined into one
   big regexp; the same holds for all $SKIP regexps. Instead of trying
   to match them all individually, the DFA executes them all in parallel.
   At any point where a match may end, this is recorded.
*/

/*
   DB_RTS(x) expands to its argument when DEBUG_RTS is non-zero and to
   nothing otherwise, so debug-only statements can be written inline.
   (An undefined DEBUG_RTS counts as 0 in the #if.)
*/
#if DEBUG_RTS
#define DB_RTS(x) x
#else
#define DB_RTS(x)
#endif
/* Set to 1 to trace the memo (un)blocking code in this file. */
#define DEBUG_MEMO_BLOCKING	0

/*
   When debugging with -G:
   output state ptrs as well
*/
#if DEBUG_RTS
#define PRINT_STATE_PTRS	1
#endif

/*------------------------------------------------------------------------------
// Global data
//----------------------------------------------------------------------------*/
/* Combined DFAs built by compile_regexps(); NULL when there are no regexps of that kind */
static RegExp *	      match_combined;
static RegExp *	      skip_combined;
/* Per-regexp lexeme type (prefix/suffix/infix/single), indexed by regexp number */
static LexemeType     *skip_regexp_lex_types;
static LexemeType     *match_regexp_lex_types;

/* statically related to nr_lexicon_nonterminals */
/* NOTE(review): the meaning of the 4 extra classes is not visible here -- confirm */
#define NR_NONT_CLASSES (arts_ifd.nr_lexicon_nonterminals + 4)

/*
   Structures for memory management.
   Each kind of object is managed by its own MemInfo.
   They are all stuffed into a memory pool for even less overhead.
   None of the memory is really freed until the very end, when
   it is all done at once. A perfect match for a pool.
*/

/* USE_POOL: take fresh objects from one shared abs_pool instead of abs_malloc */
#define USE_POOL	1
/* MEM_STATS: report allocation counters in end_memory_manager() */
#define MEM_STATS	0

#if USE_POOL
/* The single pool backing all MemInfo allocators below */
static struct abs_pool *mem_pool;
#endif

/*
   Bookkeeping for one fixed-size object allocator.
   Released objects are chained through their first sizeof(void *) bytes.
*/
typedef struct
{ size_t obj_size;	/* size in bytes of every object handed out */
  void*	 free_list;	/* head of the chain of released objects, or NULL */
  int    requested;	/* number of alloc_mem() calls */
  int    allocated;	/* number of objects freshly obtained (not reused) */
  int    freed;		/* number of free-list entries seen by end_mem_info() */
} MemInfo;

/* One allocator per kind of object */
static MemInfo	nest_markers_mem;
static MemInfo	neg_memo_mem;
static MemInfo	pos_memo_mem;
static MemInfo	state_mem;
static MemInfo	transition_mem;

/*
   Forward declarations of routines that are used before their definition.
   (Most definitions fall outside the part of the file that declares them.)
*/
static void maybe_free_neg_memos (NegMemo *memos);
static void maybe_free_pos_memos (PosMemo *memos);
static void maybe_free_nest_markers (char *markers);
/* Not static: the 'static' keyword has been commented out on purpose */
/*static*/ void print_state_transitions (Trellis t, State state);
void print_trellis (Trellis trellis);
static void reset_pos_memos (PosMemo *pos_memos);

/* Director-set based (un)blocking of memos, see scanning_completed_cb() */
static void unblock_pos_memos_for_state (PosMemo *pos_memos, Trellis trellis, State state);
static void copy_blocked_posmemos(PosMemo *from, PosMemo *to);
static int unblock_neg_memos_for_state (NegMemo *neg_memos, Trellis trellis, State state);
static void copy_blocked_negmemos(NegMemo *from, NegMemo *to);
/* Called from scanned_lexicon_cb() once all lexicon entries of a state are known */
static void adjust_relative_penalties(Trellis t, State s);

#if SHOW_LEXINFO_NRS
/*
   Debug aid: print the table sizes taken from arts_ifd on one line,
   labelled with the caller supplied location 'where'.
*/
static void show_lexinfo_nrs (char *where)
{
  abs_printf ("lex_info nrs at %s:", where);

  /* lexical classes */
  abs_printf ("nr_nont_classes = %d ", NR_NONT_CLASSES);
  abs_printf ("nr_terminals = %d ", arts_ifd.nr_terminals);

  /* regular expressions */
  abs_printf ("nr_matches = %d ", arts_ifd.nr_match_regexps);
  abs_printf ("nr_skips = %d ", arts_ifd.nr_skip_regexps);

  /* memo tables; the last item goes through abs_message */
  abs_printf ("nr_neg_memos = %d ", arts_ifd.nr_neg_memos);
  abs_message ("nr_syntax_nonterminals = %d", arts_ifd.nr_syntax_nonterminals);
}
#endif

/*------------------------------------------------------------------------------
// Memory manager
//----------------------------------------------------------------------------*/
/*
   Prepare an allocator for objects of obj_size bytes.
   The size is rounded up to that of a pointer, because a released
   object stores the free-list link in its own first bytes.
*/
static void init_mem_info (MemInfo* mem_info, size_t obj_size)
{
  size_t effective_size = (obj_size >= sizeof (void *)) ? obj_size : sizeof (void *);

  mem_info -> requested = 0;
  mem_info -> allocated = 0;
  mem_info -> freed = 0;
  mem_info -> free_list = NULL;
  mem_info -> obj_size = effective_size;
}

/*
   Walk the free list of an allocator, counting every entry in 'freed'.
   Without a pool each entry is returned with abs_free(); with a pool
   the memory itself is left alone. The list is empty afterwards.
*/
static void end_mem_info (MemInfo* mem_info)
{
  void *entry;
  void *following;

  for (entry = mem_info -> free_list; entry != NULL; entry = following)
    { /* fetch the link before the entry may be released */
      following = *(void **) entry;
#if !USE_POOL
      abs_free (entry, "end_mem_info");
#endif
      mem_info -> freed++;
    }
  mem_info -> free_list = NULL;
}

/*
   Hand out one object of the allocator's size.
   A previously released object is reused when available; otherwise a
   fresh one is obtained (from the pool if USE_POOL, else abs_malloc).
   The contents of the returned object are not initialised here.
*/
static void *alloc_mem (MemInfo* mem_info)
{ void *obj = mem_info -> free_list;

  mem_info -> requested++;

  if (obj != NULL)
    { /* pop the head of the free list */
      mem_info -> free_list = *(void **) obj;
      return (obj);
    }

#if USE_POOL
  obj = abs_pool_malloc (mem_pool, mem_info -> obj_size, "alloc_mem");
#else
  obj = abs_malloc (mem_info -> obj_size, "alloc_mem");
#endif
  mem_info -> allocated++;
  return (obj);
}

/*
   Give an object back to its allocator: it becomes the new head of the
   free list, with the link to the old head stored in its first bytes.
*/
static void free_mem (MemInfo *mem_info, void *mem)
{
  void **link = (void **) mem;

  *link = mem_info -> free_list;
  mem_info -> free_list = mem;
}

#if MEM_STATS
/* Report the counters of one allocator, labelled with 'str' */
static void show_mem_info (MemInfo* mem_info, const char* str)
{
  int nr_requested = mem_info -> requested;
  int nr_allocated = mem_info -> allocated;
  int nr_freed = mem_info -> freed;

  abs_message ("Memory stats for %s: requested %d, allocated %d, freed %d",
	       str, nr_requested, nr_allocated, nr_freed);
}
#endif /* MEM_STATS */

/*
   Create the shared pool (if USE_POOL) and set up one allocator per
   kind of object. The table sizes come from arts_ifd; the argument
   nr_nont_classes is currently not used.
*/
static void init_memory_manager (int nr_nont_classes)
{
#if USE_POOL
  /*
   * Use one pool for all mem_infos.
   * When they are all done, just free the pool.
   */
  mem_pool = abs_pool_init(512 * 1024, 1024 * 1024);
#endif

  /* one marker per left-recursive nest; note the + 1 */
  init_mem_info (&nest_markers_mem, (arts_ifd.nr_lrec_nests + 1) * sizeof(char));

  /* whole memo tables are allocated as single objects */
  init_mem_info (&neg_memo_mem, arts_ifd.nr_neg_memos * sizeof (NegMemo));
  init_mem_info (&pos_memo_mem, arts_ifd.nr_syntax_nonterminals * sizeof (PosMemo));

  /* extension records for trellis states and transitions */
  init_mem_info (&state_mem, sizeof (struct StateExtension));
  init_mem_info (&transition_mem, sizeof (struct TransitionExtension));
}

static void end_memory_manager (void)
{ end_mem_info (&nest_markers_mem);
  end_mem_info (&neg_memo_mem);
  end_mem_info (&pos_memo_mem);
  end_mem_info (&state_mem);
  end_mem_info (&transition_mem);
#if MEM_STATS
#if USE_POOL
  abs_pool_stats(mem_pool, "end_memory_manager");
#endif
  show_mem_info (&nest_markers_mem,       "nest markers");
  show_mem_info (&neg_memo_mem,   "neg_memo tables");
  show_mem_info (&pos_memo_mem,   "pos_memo tables");
  show_mem_info (&state_mem,      "State extensions");
  show_mem_info (&transition_mem, "Transition extensions");
#endif /* MEM_STATS */
#if USE_POOL
  /*
   * Now really free all the memory.
   */
  abs_pool_free(mem_pool, "end_memory_manager");
#endif
}

static NegMemo* alloc_neg_memos (void)
{ NegMemo* neg_memos = (NegMemo*) alloc_mem (&neg_memo_mem);
  return (neg_memos);
}

static void free_neg_memos (NegMemo* neg_memos)
{ free_mem (&neg_memo_mem, neg_memos);
}

static PosMemo *alloc_pos_memos (void)
{ PosMemo *pos_memos = (PosMemo *) alloc_mem (&pos_memo_mem);
  return (pos_memos);
}

static void free_pos_memos (PosMemo* pos_memos)
{ reset_pos_memos (pos_memos);
  free_mem(&pos_memo_mem, pos_memos);
  posmemo_unknown_count -= arts_ifd.nr_syntax_nonterminals; /* posmemo_size */
}

static void free_pos_memos_quick (PosMemo* pos_memos)
{ free_mem(&pos_memo_mem, pos_memos);
}

static char *alloc_nest_markers (void)
{ char *markers = (char *) alloc_mem(&nest_markers_mem);
  return (markers);
}

static void free_nest_markers (char *markers)
{ free_mem(&nest_markers_mem, markers);
}

/*------------------------------------------------------------------------------
// Regular expressions
//----------------------------------------------------------------------------*/

/*
   Classify the text [*p_txtbeg, *p_txtend) by its leading/trailing hyphen
   and narrow the range so that those hyphens are excluded:

     "-xxx-"  -> Infix         "xxx-"  -> Prefix
     "-xxx"   -> Suffix        "xxx"   -> SingleToken

   A trailing hyphen that is escaped (preceded by an odd number of
   backslashes) is a real hyphen and does not count.
   Texts shorter than 2 characters are always SingleToken and left as is.
   Assumes real hyphens are escaped with '\'.
*/
static LexemeType derive_lex_type_and_strip_hyphens (char **p_txtbeg, char **p_txtend)
{   char *t_beg = *p_txtbeg;
    char *t_end;
    int has_endhyph = 0;

    if (*p_txtend - t_beg < 2) {
	return (SingleToken);	/* avoid indexing before string */
    }

    /* Only now is it safe to point at the last character */
    t_end = *p_txtend - 1;

    if (*t_end == '-') {
	/* count the backslashes directly in front of the hyphen:
	   if odd, the hyphen is escaped, so we don't have a prefix
	 */
	int nr_backslashes = 0;
	char *hp = t_end;
	while ((hp > t_beg) && (hp[-1] == '\\')) {
	    hp--;
	    nr_backslashes++;
	}
	has_endhyph = ((nr_backslashes % 2) == 0);
    }

    if (has_endhyph) {
	*p_txtend = t_end;
	if (*t_beg == '-') {
	    *p_txtbeg = t_beg + 1;
	    return (Infix);
	}
	return (Prefix);
    }

    if (*t_beg == '-') {
	*p_txtbeg = t_beg + 1;
	return (Suffix);
    }
    return (SingleToken);
} /* derive_lex_type_and_strip_hyphens */

/* Allocate a zero-filled array of nr lexeme types (via abs_calloc) */
static LexemeType *alloc_re_lex_types (int nr)
{
  LexemeType *types = (LexemeType *) abs_calloc (nr, sizeof (LexemeType), "alloc_re_lex_types");
  return (types);
}

/* Release an array obtained from alloc_re_lex_types() */
static void free_re_lex_types (LexemeType *re_lex_types)
{
  abs_free (re_lex_types, "free_re_lex_types");
}

/*
   Build one combined regexp out of all $MATCH (do_matches != 0) or all
   $SKIP (do_matches == 0) regexps named in arts_ifd.

   Combining costs nothing at match time, since a DFA runs all the
   alternatives in parallel, and it saves work when nothing matches.

   For every regexp its lexeme type is derived from leading/trailing
   hyphens; the array with these types is stored through re_lex_types_h.
   Returns NULL (and leaves *re_lex_types_h alone) if nr_regexps <= 0.
*/
static RegExp *compile_regexps (int nr_regexps, int do_matches, LexemeType **re_lex_types_h)
{ RegExp *all_alts;
  LexemeType *lex_types;
  int nr;

  if (nr_regexps <= 0)
    return NULL;

  lex_types = alloc_re_lex_types (nr_regexps);
  *re_lex_types_h = lex_types;
  all_alts = empty_regexp();

  for (nr = 0; nr < nr_regexps; nr++)
    { char *src_beg;
      char *src_end;
      size_t src_len;
      char *stripped;

      if (do_matches)
	src_beg = arts_ifd.match_regexp_names[nr].str;
      else
	src_beg = arts_ifd.skip_regexp_names[nr].str;
      src_end = src_beg + strlen (src_beg);

      lex_types[nr] = derive_lex_type_and_strip_hyphens (&src_beg, &src_end);

      if (*src_end == '\0')
	{ /* nothing was cut off at the end: use the text in place */
	  add_alternative_to_regexp (all_alts, src_beg);
	  continue;
	}

      /* the end was set back, so hand over a terminated private copy */
      src_len = src_end - src_beg;
      stripped = (char *) abs_malloc (src_len + 1, "compile_regexps:dup");
      memcpy (stripped, src_beg, src_len);
      stripped[src_len] = '\0';
      add_alternative_to_regexp (all_alts, stripped);
      abs_free (stripped, "compile_regexps:free");
    }

  finalize_regexp(all_alts);
  return (all_alts);
}

/*
   Undo compile_regexps(): the lexeme type array only exists when there
   was at least one regexp, the combined regexp only when non-NULL.
*/
static void delete_regexps (int nr_regexps,
			    LexemeType *re_lex_types, RegExp *combined)
{
  if (nr_regexps > 0)
    { free_re_lex_types (re_lex_types);
    }
  if (combined != NULL)
    { delete_regexp (combined);
    }
}

/* Compile the combined $MATCH and $SKIP regexps (in that order) */
static void init_regexps (void)
{
  int nr_matches = arts_ifd.nr_match_regexps;
  int nr_skips = arts_ifd.nr_skip_regexps;

  match_combined = compile_regexps (nr_matches, 1, &match_regexp_lex_types);
  skip_combined = compile_regexps (nr_skips, 0, &skip_regexp_lex_types);
}

/* Release what init_regexps() built */
static void end_regexps (void)
{
  int nr_matches = arts_ifd.nr_match_regexps;
  int nr_skips = arts_ifd.nr_skip_regexps;

  delete_regexps (nr_matches, match_regexp_lex_types, match_combined);
  delete_regexps (nr_skips, skip_regexp_lex_types, skip_combined);
}

/*------------------------------------------------------------------------------
// Initialization and finalization of module
//----------------------------------------------------------------------------*/
/*
   Set up the lexer module: object allocators first, then the character
   tables of the lexicon library, then the combined regular expressions.
*/
void init_lexer ()
{
#if DEBUG_NONT_CLASSES
  abs_message ("init_lexer: directors_option=%d neg_memo_option=%d",
	       arts_ifd.directors_option, arts_ifd.neg_memo_option);
#endif

  init_memory_manager (NR_NONT_CLASSES);

  lxcn_init_char_tables (arts_ifd.white_space_chars,
			 arts_ifd.translate_src,
			 arts_ifd.translate_dst,
			 arts_ifd.translate_penalties);

  init_regexps ();

#if SHOW_LEXINFO_NRS
  show_lexinfo_nrs ("end of init_lexer");
#endif
}

/*
   To be called at the very end of the program.
   Memory is only given back when the free_mem option asks for it.
*/
void end_lexer (void)
{
  if (!arts_ifd.free_mem_option)
    return;

  end_regexps ();
  end_memory_manager ();
}

/*------------------------------------------------------------------------------
// Initialization of states
//----------------------------------------------------------------------------*/
/* Base penalties per kind of transition (only applied when
   arts_ifd.transition_penalties_option is set, see the scanned_*_cb callbacks) */
const Penalty penalty_unknown = 0;
const Penalty penalty_transition = 1;	/* Initial penalty for any transition */
const Penalty penalty_lexicon = 1;
const Penalty penalty_regexp_match = 1;	/* $MATCH, see scanned_match_cb() */
const Penalty penalty_regexp_skip = 2;	/* $SKIP, see scanned_skip_cb() */
const Penalty penalty_other = 1;	/* unrecognised input, see scanned_other_cb() */
const Penalty penalty_grammar_terminal = 1;	/* see scanned_lexicon_cb() */
const Penalty penalty_lexicon_terminal = 1;	/* before the frequency bonus is subtracted */

/*------------------------------------------------------------------------------
// Encoding functions for Transition.terminal field.
//
// See rtslex.h for actual bit encodings.
// code_nonterminal	used in Transition expansion (by TaggedValues) after match process
// code_terminal	used in the match process
// code_regexp		used in the match process
//----------------------------------------------------------------------------*/
/*
static Terminal code_nonterminal (unsigned id, int arity)
{ return (ENCODE_NONT (id, arity));
}
*/

/* Encode grammar terminal number 'id' */
static Terminal code_terminal (unsigned id)
{
  Terminal coded = ENCODE_TERM (id);
  return (coded);
}

/* Encode $MATCH regexp number 'id' */
static Terminal code_regexp_match (unsigned id)
{
  Terminal coded = ENCODE_MATCH (id);
  return (coded);
}

/* Encode $SKIP regexp number 'id' */
static Terminal code_regexp_skip (unsigned id)
{
  Terminal coded = ENCODE_SKIP (id);
  return (coded);
}

/* The code for a white space transition */
static Terminal code_whitespace ()
{
  Terminal coded = ENCODE_WHITE ();
  return (coded);
}

/* The code for a transition over otherwise unrecognised input */
static Terminal code_other ()
{
  Terminal coded = ENCODE_OTHER ();
  return (coded);
}

/*
   Code some transition tests.
   The type seems a bit unneeded, the terminal number encodes the
   same sort of info in the top bits.
*/


/*
   Each test below yields the raw masked type bit:
   non-zero if the transition is of the given kind, 0 otherwise.
*/

/* End of sentence? */
int is_eos_transition (TransitionExtension transition)
{
  return transition->type & EosBit;
}

/* Grammar terminal? */
static int is_terminal_transition (TransitionExtension transition)
{
  return transition->type & TermBit;
}

/* Lexicon nonterminal? */
static int is_lexicon_transition (TransitionExtension transition)
{
  return transition->type & LexBit;
}

/* Fact? */
static int is_fact_transition (TransitionExtension transition)
{
  return transition->type & FactBit;
}

/* $SKIP regexp? */
static int is_skip_regexp_transition (TransitionExtension transition)
{
  return transition->type & SkipBit;
}

/* $MATCH regexp? */
static int is_match_regexp_transition (TransitionExtension transition)
{
  return transition->type & MatchBit;
}

/* White space? */
static int is_whitespace_transition (TransitionExtension transition)
{
  return transition->type & WhiteBit;
}

/* Otherwise unrecognised input? */
static int is_other_transition (TransitionExtension transition)
{
  return transition->type & OtherBit;
}

/*
 * Calculate a bonus for multi-token matches: the square of the
 * number of space sequences in it.
 *
 * lexeme  points to the text; it need not be NUL terminated.
 * length  is the number of characters to inspect; nothing beyond
 *         lexeme[length - 1] is read. A length <= 0 yields 0.
 *
 * A run of consecutive spaces counts as a single sequence.
 */

int multi_token_bonus(char *lexeme, int length)
{
    const char *cur = lexeme;
    const char *end;
    int nr_space_runs = 0;

    if (length <= 0)
	return 0;
    end = lexeme + length;

    while (cur < end) {
	if (*cur != ' ') {
	    cur++;
	    continue;
	}
	/* start of a run of spaces: count it once, then skip the run,
	 * but never look beyond the stated length */
	nr_space_runs++;
	while (cur < end && *cur == ' ')
	    cur++;
    }

    return nr_space_runs * nr_space_runs;
}

#if 0
/*
   Disabled code: these initialisers write fields directly into a
   Transition, whereas the live code in this file keeps such data in a
   TransitionExtension (see alloc_transition_extension()).
   Kept for reference only; it is never compiled.
*/
/*
   TODO: Maybe we should clear params, memos, next, and trans in alloc_state.
*/
static void init_transition (Transition *transition)
{ transition -> params = NULL;
  transition -> penalty = penalty_unknown;
  transition -> trans_dest_state = NULL;
  transition -> next = NULL;
}

static void init_eos_transition (Transition *transition)
{ init_transition (transition);
  transition -> terminal = code_terminal (arts_ifd.eos_terminal);
  transition -> text = "<EOS>";
  transition -> type = EosBit | code_transition_type (SingleToken);
}

static void init_other_transition (Transition *transition, char *txt, int len)
{ init_transition (transition);
  transition -> terminal = ENCODE_OTHER (0);
  transition -> text = copy_string (txt, len);
  transition -> type = OtherBit | TxtFreeBit | code_transition_type (SingleToken);
  transition -> penalty = penalty_other;
}
#endif

TransitionExtension
alloc_transition_extension()
{
    TransitionExtension ext;

    ext = (TransitionExtension)alloc_mem(&transition_mem);
    ext->type = 0;
    ext->terminal = 0;
    ext->params = NULL;
    ext->penalty = 0;
    ext->lextype = SingleToken;
    ext->flags = 0;

    return ext;
}

/*
 * Callback: release the extension attached to a transition, together
 * with the parameter array it refers to.
 * NOTE(review): params is NULL for extensions that never got any
 * (see alloc_transition_extension); this relies on abs_free accepting
 * NULL -- confirm.
 */
void
free_transition_extension_cb(Trellis t, Transition trans)
{
    TransitionExtension doomed = GetTransitionExtension(trans);
    DB_RTS(abs_message("free_transition_extension_cb %p -> %p", trans, doomed);)

    /* first what the extension owns, then the extension itself */
    abs_free(doomed->params, "free_transition_extension_cb");
    free_mem (&transition_mem, doomed);
}

/*
 * Callback from scanning a lexicon entry.
 * nr_entries = lxcn_get_entries_from_nr (t->lexicon, entry_nr, &entries);
 * has just been called; idx ranges [ 0,nr_entries >
 *
 * Returns the class of the lexicon nonterminal,
 * or -1 for a grammar terminal.
 *
 * If called with trans == NULL, this is a final call after all lexicon
 * entries have been processed.
 */
static
int scanned_lexicon_cb(Trellis t, State s, Transition trans, int *entries, int idx, int lex_type)
{
    int kind;
    int nont_nr, arity, freq;
    int nont_class;
    Value *params;
    TransitionExtension ext;

    /* Final call for this state: all lexicon transitions are known now,
       so their penalties can be made relative to the smallest one. */
    if (trans == NULL) {
	adjust_relative_penalties(t, s);

	return 0;
    }

    /* Attach a fresh extension to the transition and fill it in below */
    ext = alloc_transition_extension();
    trel_trans_set_ext(trans, ext);

    lxcn_get_entry_params (arts_ifd.lexicon, entries, idx, &kind, &nont_nr, &arity, &freq, &params);

    ext->lextype = lex_type;

    if (kind) {	/* a grammar terminal */
	/* NOTE(review): params is not stored or released on this path;
	   if lxcn_get_entry_params allocates it for terminals too, it leaks -- confirm */
	ext->terminal =  ENCODE_TERM (nont_nr);
	nont_class = -1;
	ext->type = TermBit;

	if (arts_ifd.transition_penalties_option) {
	    ext->penalty = penalty_grammar_terminal;
	}
    } else {	/* a lexicon nonterminal */
	ext->terminal = ENCODE_NONT (nont_nr, arity);
	/* the extension keeps params; free_transition_extension_cb releases it */
	ext->params = params;
	/* map the nonterminal number to its class in the lexicon */
	nont_class = (int)arts_ifd.lex_nont_nrs_table[nont_nr].ilval;
	ext->type = LexBit;

	if (arts_ifd.transition_penalties_option) {
	    /* frequent entries get a lower penalty */
	    ext->penalty = penalty_lexicon_terminal
			   - bonus_from_frequency(freq, arts_ifd.radix.lexicon_frequency);
	}
	/* the class must be a valid lexicon nonterminal index */
	if ((nont_class < 0) || (nont_class >= arts_ifd.nr_lexicon_nonterminals))
	    abs_abort ("scanned_lexicon_cb", "bad nontnr %d to lexicon nontnr %d",
		nont_nr, nont_class);
    }

    /* already in lexicon builder 
    if (arts_ifd.transition_penalties_option &&
	    lex_type == MultiToken) {
	char *text = trel_trans_get_text(trans);
	int len = trel_trans_get_length(trans);
	ext->penalty -= multi_token_bonus(text, len);
    }
    */

    DB_RTS(abs_message("scanned_lexicon_cb %p -> %p, terminal = %x", trans, ext, ext->terminal);)

    return nont_class;
}

/*
 * Callback from scanning a $MATCH regexp with number id.
 *
 * With trans == NULL this is only a query: the lexeme type of the
 * regexp is returned and nothing else happens.
 * Otherwise a fresh extension describing the match is attached to the
 * transition and 0 is returned.
 */
static
int scanned_match_cb(Trellis t, State s, Transition trans, int id)
{
    TransitionExtension mext;

    if (trans == NULL)
	return match_regexp_lex_types[id];

    mext = alloc_transition_extension();
    trel_trans_set_ext(trans, mext);
    DB_RTS(abs_message("scanned_match_cb %p -> %p", trans, mext);)

    mext->terminal = code_regexp_match (id);
    mext->type = MatchBit;

    if (arts_ifd.transition_penalties_option) {
	/* Apply the adjustment made to the lexicon transitions of this
	 * state, but only if it is negative (it then raises the penalty). */
	StateExtension sext = GetStateExtension(s);
	int adj = sext->penalty_adjustment;

	mext->penalty = penalty_regexp_match - ((adj > 0) ? 0 : adj);
    }
    mext->lextype = match_regexp_lex_types[id];

    /* Regexp matches do not receive a multi-token bonus. */

    return 0;
}

/*
 * Callback from scanning a $SKIP regexp with number id.
 *
 * With trans == NULL this is only a query: the lexeme type of the
 * regexp is returned and nothing else happens.
 * Otherwise a fresh extension describing the skip is attached to the
 * transition and 0 is returned.
 */
static
int scanned_skip_cb(Trellis t, State s, Transition trans, int id)
{
    TransitionExtension skext;

    if (trans == NULL)
	return skip_regexp_lex_types[id];

    skext = alloc_transition_extension();
    trel_trans_set_ext(trans, skext);
    DB_RTS(abs_message("scanned_skip_cb %p -> %p", trans, skext);)

    skext->terminal = code_regexp_skip (id);
    skext->type = SkipBit;

    if (arts_ifd.transition_penalties_option) {
	/* Apply the adjustment made to the lexicon transitions of this
	 * state, but only if it is negative (it then raises the penalty). */
	StateExtension sext = GetStateExtension(s);
	int adj = sext->penalty_adjustment;

	skext->penalty = penalty_regexp_skip - ((adj > 0) ? 0 : adj);
    }
    skext->lextype = skip_regexp_lex_types[id];

    /* Regexp skips do not receive a multi-token bonus. */

    return 0;
}

/*
 * Callback from scanning white space: attach a fresh extension marked
 * as white space to the transition. No penalty is set.
 */
static
void scanned_whitespace_cb(Trellis t, State s, Transition trans)
{
    TransitionExtension wext;

    wext = alloc_transition_extension();
    trel_trans_set_ext(trans, wext);
    DB_RTS(abs_message("scanned_whitespace_cb %p -> %p", trans, wext);)

    wext->type = WhiteBit;
    wext->terminal = code_whitespace();
}

/*
 * Callback from scanning input that nothing else recognised: attach a
 * fresh extension marked OtherBit to the transition.
 *
 * When transition penalties are in use, the penalty is penalty_other,
 * raised by a negative penalty adjustment of the state (a positive one
 * is ignored) and lowered by the multi-token bonus of the text.
 */
static
void scanned_other_cb(Trellis t, State s, Transition trans)
{
    TransitionExtension ext = alloc_transition_extension();

    trel_trans_set_ext(trans, ext);
    DB_RTS(abs_message("scanned_other_cb %p -> %p", trans, ext);)
    ext->terminal = code_other();
    ext->type = OtherBit;

    /* One test suffices: the option cannot change within this function */
    if (arts_ifd.transition_penalties_option) {
	StateExtension sext = GetStateExtension(s);
	int adj = sext->penalty_adjustment;
	char *text = trel_trans_get_text(trans);
	int len = trel_trans_get_length(trans);

	if (adj > 0)
	    adj = 0;

	ext->penalty = penalty_other - adj;
	ext->penalty -= multi_token_bonus(text, len);
    }
}

/*
 * Callback from reaching the end of the sentence: attach a fresh
 * extension carrying the end-of-sentence terminal to the transition.
 * It counts both as end-of-sentence and as grammar terminal.
 */
static
void scanned_eos_cb(Trellis t, State s, Transition trans)
{
    TransitionExtension eext;

    eext = alloc_transition_extension();
    trel_trans_set_ext(trans, eext);
    DB_RTS(abs_message("scanned_eos_cb %p -> %p", trans, eext);)

    eext->type = EosBit | TermBit;
    eext->terminal = code_terminal (arts_ifd.eos_terminal);
}

/*
 * Once lexical scanning for an input position is complete, we can use
 * the director set to block PosMemos and NegMemos.
 * The director set is the set of terminals that are possible at the
 * start of a rule, so blocking a rule from the director set effectively
 * means/happens when "none of the terminals in the set occur here".
 * In order to evaluate this, we use a temporary set of Memos, initialised
 * all to BLOCKED. For the elements in the director set that are actually
 * present, we unblock rules. That is the information that the table
 * provides us. When we have done that, we examine the temporary Memos
 * and block the real Memos where the temporary ones remained blocked.
 *
 * (We can't just start all real Memos with blocked, since then we'd never
 * enter any rule, and so never do any lexical scanning!)
 */
/* Compile-time switches for the director-set based blocking */
#define INIT_POSMEMO_WITH_DIRSET	1
#define INIT_NEGMEMO_WITH_DIRSET	0	/* see XXX in copy_blocked_negmemos() */
#define REBLOCK_POSMEMOS		0	/* off; see copy_blocked_posmemos() */
#define REBLOCK_NEGMEMOS		1

/*
 * Callback: lexical scanning for state s is complete.
 * Builds a temporary, fully blocked memo table, unblocks the entries
 * allowed by the transitions found at s, and copies what is still
 * blocked into the memo table of the state.
 */
static
void scanning_completed_cb(Trellis t, State s)
{
#if DEBUG_MEMO_BLOCKING
    abs_message("scanning_completed_cb %p pos %d.%c", s, trel_state_get_pos(s), trel_state_get_lex_state(s));
#endif

    /*
     * Process Positive memos
     */
    if (INIT_POSMEMO_WITH_DIRSET &&
	    arts_ifd.directors_option && arts_ifd.directors_set_posmemos) {
	PosMemo *pos_memos;
	StateExtension ext;
	int ix;
	/* temporary table, one entry per syntax nonterminal */
	pos_memos = alloc_pos_memos();

	for (ix = 0; ix < arts_ifd.nr_syntax_nonterminals; ix++)
	    pos_memos[ix] = POSMEMO_BLOCKED;
	unblock_pos_memos_for_state(pos_memos, t, s);
	ext = GetStateExtension(s);
	copy_blocked_posmemos(pos_memos, ext->pos_memos);

	/* quick variant: the temporary table was never counted in
	   posmemo_unknown_count, so it must not be subtracted either */
	free_pos_memos_quick(pos_memos);
	/* ext->flags |= MemoBit; */
    }

    /*
     * Process Negative memos
     */
    /* Currently compiled out, since INIT_NEGMEMO_WITH_DIRSET is 0.
       NOTE(review): this tests directors_set_posmemos, same as the
       positive case above -- confirm that is intended for neg memos */
    if (INIT_NEGMEMO_WITH_DIRSET &&
	    arts_ifd.directors_option && arts_ifd.directors_set_posmemos) {
	NegMemo *neg_memos;
	StateExtension ext;
	int ix;
	int neg_memo_size = arts_ifd.nr_neg_memos;
	neg_memos = alloc_neg_memos();

	for (ix = 0; ix < neg_memo_size; ix++)
	    neg_memos[ix] = NEGMEMO_BLOCKED;
	/* only copy when unblock_neg_memos_for_state() returns non-zero */
	if (unblock_neg_memos_for_state(neg_memos, t, s)) {
	    ext = GetStateExtension(s);
	    copy_blocked_negmemos(neg_memos, ext->neg_memos);
	}

	free_neg_memos(neg_memos);
	/* ext->flags |= MemoBit; */
    }
}

/*
   Make sure the transition list starting at *where begins with a fact
   transition for lexicon entry idx of entries.
   An existing transition with the same encoded fact, penalty and
   parameters is moved to the front; otherwise a new one, leading to
   i_state, is created and put there.
   Returns 1 if an existing transition was reused, 0 if one was created.
*/
int init_or_find_fact_transition (Trellis trellis, int *entries, int idx,
				  State i_state, Transition *where)
{ Lexicon the_lex = arts_ifd.lexicon;
  int nont_nr, arity, freq, kind;
  int encoded_fact;
  int penalty;
  Value *params;
  int found = 0;
  Transition *prev = where;	/* the link that points at 'transition' */
  Transition transition = *prev;
  TransitionExtension ext;

  lxcn_get_entry_params (the_lex, entries, idx, &kind, &nont_nr, &arity, &freq, &params);
  encoded_fact = ENCODE_FACT (nont_nr, arity);
  /* facts only ever get a bonus, i.e. a non-positive penalty for freq >= 0 */
  penalty = -bonus_from_frequency(freq, arts_ifd.radix.fact_frequency);

  /*
   * "where" indicates where to start looking for the transition,
   * and where to place it once found or created.
   */
  while (transition != NULL)
  { 
    ext = GetTransitionExtension(transition);
    if (ext -> terminal         == encoded_fact &&
	ext -> penalty          == penalty &&
	/* always true: transition -> trans_dest_state == i_state && */
	!memcmp(ext -> params, params, arity * sizeof (*params)))
    { found = 1;
      /* Detach it from its current list position */
      *prev = trel_trans_get_next(transition);
      /* Free the data from lxcn_get_entry_params() */
      abs_free(params, "init_fact_transition");
      break;
    }
    prev = trel_trans_get_nextptr(transition);
    transition = *prev;
  }
  
  /*
   * Duplicate some code from init_fact_transition() below,
   * to avoid repeating the expensive call to lxcn_get_entry_params().
   */
  if (transition == NULL)
  { transition = trel_alloc_transition (trellis);
    trel_trans_set_dest(transition, i_state);
    ext = alloc_transition_extension();
    trel_trans_set_ext(transition, ext);
    ext -> type = FactBit;
    ext -> terminal = encoded_fact;
    /* the new extension takes over params */
    ext -> params = params;
    ext -> penalty = penalty;
    found = 0;
  }
  /* Found or new: (re)insert the transition at the head of the list */
  trel_trans_set_next(transition, *where);
  *where = transition;
  return found;
}

/* Number of flag bits stored in the ilval member of each DATA word */
#define BITS_PER_WORD	32
/*
   Non-zero if bit n is set in the bit vector t, an array of DATA words.
   Note that n is evaluated twice.
*/
#define CHECK_BIT(t, n)	((t)[(n) / BITS_PER_WORD].ilval & \
			 ((int64)1 << ((n) % BITS_PER_WORD)))

/*
 * Indeed, you can get the same [bug] without follower set optimisation:
 *
 * demo g:
 *    a, $SKIP("xxx");
 *    a, b, c, d, e, f + g.
 * 
 * a:	"a".
 * b, c, d, e, f: .
 * g
 *
 * The input is "ag".
 *
 * where the $SKIP completes scanning at input state 1.S, which triggers
 * blocking of posmemos which don't comply with director sets.
 * This doesn't happen more often since the memo-blocking-by-director-set
 * isn't as aggressive as one might initially think. (The fully-scanned
 * event isn't triggered so often; it was chosen not to declare scanning
 * finished before scanning $MATCHes, which doesn't always happen; otherwise
 * the trellis would fill up with a large quantity of them)
 *
 * This issue should be fixed with the introduction of
 * if_glue_in_directorset().
 */

/*
 * If a director set contains glue, that means it must be nonfalse
 * and somewhere following its use there is glue (possibly with other
 * nonfalse rules called in between).
 * If this is the case, director set checking in this input state
 * is useless, since the glue will change the state before any
 * actual input is matched.
 * Therefore, no blocking should take place.
 * (It could be made more precise to allow blocking only if there is
 * no appropriate input for either input state)
 */

/*
 * Tell whether rule nont_nr has glue in its director set.
 * The answer is read from the bit vector in slot 2 of
 * arts_ifd.other_posmemo_dir. Returns 1 if so, 0 if not.
 */
static
int if_glue_in_directorset (int nont_nr)
{
    DATA *glue_bits = arts_ifd.other_posmemo_dir[2].data;/* magic number! */
    int has_glue = CHECK_BIT(glue_bits, nont_nr) ? 1 : 0;

#if DEBUG_FOLLOW_BLOCKING
    if (has_glue)
	  abs_message("if_glue_in_directorset rule %d: YES", nont_nr);
    else
	  abs_message("if_glue_in_directorset rule %d: NO", nont_nr);
#endif

    return has_glue;
}

/*
 */

#if 0
/*
   Disabled code: create a new fact transition to i_state for lexicon
   entry idx. Its work is done by init_or_find_fact_transition() now.
   It would no longer compile as is: bonus_from_frequency() takes a
   radix as second argument.
*/
Transition init_fact_transition (Trellis trellis, int *entries, int idx, State i_state)
{
  Transition transition = trel_alloc_transition (trellis);
  Lexicon the_lex = arts_ifd.lexicon;
  int nont_nr, arity, freq, kind;
  Value *params;
  TransitionExtension ext;

  lxcn_get_entry_params (the_lex, entries, idx, &kind, &nont_nr,
                        &arity, &freq, &params);

  trel_trans_set_dest(transition, i_state);
  ext = alloc_transition_extension();
  trel_trans_set_ext(transition, ext);
  ext -> type = FactBit;
  ext -> terminal = ENCODE_FACT (nont_nr, arity);
  ext -> params = params;
  ext -> penalty = -bonus_from_frequency(freq); 	
  return (transition);
}
#endif

/*
 *   freq -> bonus
 * -------+-------
 *  0	  |   0
 *  1	  |   1
 *  2	  |   2  (except for radix 2!)
 * ...
 * radix-1|   2	 (except for radix 2!)
 * radix  |   3
 * radix^2|   4
 * radix^3|   5
 * ...
 *
 * Special cases:
 * 
 * radix = 0 => bonus = 0
 * radix = 1 => bonus = frequency
 */

/*
 * Convert a frequency into a bonus that grows with the logarithm
 * (base radix) of the frequency:
 *
 *   |freq| <= 1  ->  freq itself
 *   |freq| >= 2  ->  sign(freq) * (2 + number of times |freq| can be
 *                    divided by radix before dropping below radix)
 *
 * radix 0 switches the bonus off, radix 1 makes it equal to freq.
 * A negative radix has no meaning; the smallest bonus of size 2 is
 * then given to every |freq| >= 2.
 *
 * The count is done by repeated division, so it cannot overflow and is
 * exact for frequencies up to INT_MAX. The most negative int is treated
 * as -INT_MAX.
 */
int bonus_from_frequency (int freq, int radix)
{ int sign = +1;
  int magnitude = freq;
  int bonus;

  if (radix == 0) return 0;
  if (radix == 1) return freq;

  if (freq < 0)
    { sign = -1;
      /* -freq would overflow for the most negative value */
      magnitude = (freq < -INT_MAX) ? INT_MAX : -freq;
    }
  if (magnitude <= 1) return freq;	/* -1, 0 or 1 */
  if (radix < 0) return (sign * 2);

  /* here radix >= 2, so the division makes progress */
  for (bonus = 2; magnitude >= radix; magnitude /= radix)
    bonus++;
  return (sign * bonus);
}

/*
   Adjust the penalties of transitions relative to each other,
   by finding the smallest among the possible lexicon transitions out of a
   state, and subtracting it from all.
   We loop over all classes here, but we check for transitions
   being of the right type (lexicon and grammar terminals).
   This is because we only know for sure that those terminals have
   been fully discovered.
   Remember the adjustment we make, so that it can be applied to
   $SKIP and $MATCH too. It is guaranteed that they are not found
   before lexicon entries.
*/
/*
 * Return the lowest penalty among the grammar terminal and lexicon
 * transitions leaving state s, or INT_MAX if there are none.
 */
static Penalty find_smallest_penalty(Trellis t, State s)
{
    TransIter walker = trel_trans_iter(t, s);
    Penalty lowest = INT_MAX;
    Transition cur;

    for (cur = trel_trans_iter_next(walker);
	 cur != NULL;
	 cur = trel_trans_iter_next(walker)) {
	TransitionExtension cext = GetTransitionExtension(cur);

	/* other kinds of transition may not be fully discovered yet */
	if (!(cext->type & (TermBit | LexBit)))
	    continue;
	if (cext->penalty < lowest)
	    lowest = cext->penalty;
    }

    trel_trans_iter_end(walker);
    return lowest;
}

/*
 * Subtract delta from the penalty of every grammar terminal and
 * lexicon transition leaving state s; other transitions are untouched.
 */
static void adjust_penalties(Trellis t, State s, Penalty delta)
{
    TransIter walker = trel_trans_iter(t, s);
    Transition cur;

    for (cur = trel_trans_iter_next(walker);
	 cur != NULL;
	 cur = trel_trans_iter_next(walker)) {
	TransitionExtension cext = GetTransitionExtension(cur);

	if (!(cext->type & (TermBit | LexBit)))
	    continue;
	cext->penalty -= delta;
    }

    trel_trans_iter_end(walker);
}

/*
 * Make the penalties of the lexicon/terminal transitions out of s
 * relative to the smallest one, which ends up at penalty_transition.
 * The amount subtracted is remembered in the state extension, for use
 * by the $MATCH, $SKIP and "other" callbacks.
 * Does nothing if penalties are not in use or no such transition exists.
 */
static void adjust_relative_penalties(Trellis t, State s)
{ 
    Penalty lowest;
    Penalty delta;
    StateExtension sext;

    /* no point in doing the work if the penalties are never used */
    if (!arts_ifd.transition_penalties_option)
	return;

    lowest = find_smallest_penalty(t, s);
    DB_RTS(abs_message("adjust_relative_penalties: State %p, smallest = %d",
	    s, lowest);)

    if (lowest == INT_MAX)
	return;

    sext = GetStateExtension(s);
    /*
     * Subtracting a little less than the minimum also raises every
     * penalty by penalty_transition in the same pass.
     */
    delta = lowest - penalty_transition;
    sext->penalty_adjustment = delta;
    adjust_penalties(t, s, delta);
}


/* Same value as the earlier definition of BITS_PER_WORD in this file */
#define BITS_PER_WORD 32

/*
   Set neg_memos[i] to NEGMEMO_UNKNOWN for every i whose bit is set in
   the bit vector neg_memo_directors. The vector holds BITS_PER_WORD
   bits per DATA word (in ilval), least significant bit first, and
   covers arts_ifd.nr_neg_memos bits. Entries with a clear bit are left
   as they are.
*/
static void unblock_neg_memos_from_list (NegMemo *neg_memos, DATA *neg_memo_directors)
{ int neg_memo_size = arts_ifd.nr_neg_memos;
  unsigned int word;	/* remaining bits of the current word, shifted down */
  int bit, i;		/* bit: position within word; i: memo index */

  /* open neg_memos in bit-vector */
  /* the first word is always read, even if neg_memo_size is 0 */
  word = (unsigned int) neg_memo_directors->ilval;
  neg_memo_directors++;
  bit = 0;
  for (i = 0; i < neg_memo_size; i++)
    { if (bit == BITS_PER_WORD)
	{ /* current word used up: move on to the next one */
	  word = (unsigned int) neg_memo_directors->ilval;
	  neg_memo_directors++;
	  bit = 0;
	};
      /* assert((i % BITS_PER_WORD) == bit); */
      if (word == 0)	/* optimisation: skip large groups of zeros */
	{ /* fprintf(stderr, "unblock_neg_memos_from_list: skipping %d bits\n", BITS_PER_WORD - bit); */
	  /* no set bits remain in this word: jump i to its last bit */
	  i += BITS_PER_WORD - bit - 1;	/* loop does another i++ */
	  bit = BITS_PER_WORD;	/* bit += BITS_PER_WORD - bit */
	  continue;
	}
      if (word & 0x01)
        { /* fprintf(stderr, "unblock_neg_memos_from_list: rule %d\n", i); */
	  neg_memos[i] = NEGMEMO_UNKNOWN;
	}
      bit++;
      word >>= 1;
    };
}

/* Attach a neg memo table to a state extension; a NULL extension is ignored. */
static void add_state_neg_memos (StateExtension state, NegMemo* neg_memos)
{
  if (state == NULL)
    return;

  state->neg_memos = neg_memos;
}

/*
 * Walk the chain of regexp transitions starting at trans (following
 * trel_trans_get_next) and, for each of them, unblock the neg memos
 * listed in the bit vector dirp[<regexp number>].data.
 *
 * neg_memos: neg memo table to update.
 * trans:     first transition of the chain, may be NULL.
 * dirp:      table of bit vectors, indexed by regexp number.
 */
static void unblock_neg_memos_from_regexp (NegMemo *neg_memos, Transition trans, DATA *dirp)
{
  while (trans != NULL)
    { TransitionExtension ext = GetTransitionExtension(trans);
      int nr = DECODE_REGEXP_NUMBER(ext->terminal);
      DATA *dir = dirp[nr].data;
#if DEBUG_MEMO_BLOCKING
      /* report under this function's own name (it used to say "pos") */
      abs_message("unblock_neg_memos_from_regexp: %d", nr);
#endif
      unblock_neg_memos_from_list (neg_memos, dir);
      trans = trel_trans_get_next(trans);
    }
}

/*
 * Walk the chain of terminal transitions starting at trans and, for each
 * of them, unblock the neg memos listed in the bit vector
 * dirp[<terminal number>].data.
 */
static void unblock_neg_memos_from_term (NegMemo *neg_memos, Transition trans, DATA *dirp)
{
  Transition cur;

  for (cur = trans; cur != NULL; cur = trel_trans_get_next(cur))
    { TransitionExtension info = GetTransitionExtension(cur);
      int term_nr = DECODE_TERM_NUMBER(info->terminal);
      DATA *directors = dirp[term_nr].data;

#if DEBUG_MEMO_BLOCKING
      abs_message("unblock_neg_memos_from_term: %d", term_nr);
#endif
      unblock_neg_memos_from_list (neg_memos, directors);
    }
}

/*
 * Unblock, in neg_memos, every neg memo that appears in the inverted
 * director set of some transition leaving the given state.
 *
 * Returns 0, without touching neg_memos, when the state has a $OTHER
 * transition (there are no negmemo directors for $OTHER); returns 1
 * otherwise.
 */
static int unblock_neg_memos_for_state (NegMemo *neg_memos, Trellis trellis, State state)
{
      int lex_class;

#if DEBUG_MEMO_BLOCKING
      abs_message("unblock_neg_memos_for_state: pos %d.%c", trel_state_get_pos(state), trel_state_get_lex_state(state));
#endif

      /* $OTHERs: no director sets available, so nothing may be blocked */
      if (trel_state_scan_other(trellis, state) != NULL)
	return 0;

      /* lexicon nonterminals: one inverted director set per class */
      for (lex_class = 0; lex_class < arts_ifd.nr_lexicon_nonterminals; lex_class++)
	{ if (trel_state_scan_lexterminal(trellis, state, lex_class) == NULL)
	    continue;
	  unblock_neg_memos_from_list (neg_memos, arts_ifd.lex_negmemo_dir[lex_class].data);
	}

      /* grammar terminals */
      unblock_neg_memos_from_term(neg_memos,
				  trel_state_scan_terminal(trellis, state),
				  arts_ifd.term_negmemo_dir);

      /* $end of sentence$ */
      unblock_neg_memos_from_term(neg_memos,
				  trel_state_scan_eos(trellis, state),
				  arts_ifd.term_negmemo_dir);

      /* $MATCHes */
      unblock_neg_memos_from_regexp(neg_memos,
				    trel_state_scan_match(trellis, state),
				    arts_ifd.match_negmemo_dir);

      /* $SKIPs */
      unblock_neg_memos_from_regexp(neg_memos,
				    trel_state_scan_skip(trellis, state),
				    arts_ifd.skip_negmemo_dir);

      return 1;
}

/*
 * For every neg memo that is BLOCKED in `from`, block the matching entry
 * of `to` as well, provided that entry is still UNKNOWN. Entries of `to`
 * holding any other value are only overwritten when REBLOCK_NEGMEMOS is
 * enabled.
 */
static
void copy_blocked_negmemos(NegMemo *from, NegMemo *to)
{
    int memo;

    for (memo = 0; memo < arts_ifd.nr_neg_memos; memo++) {
	/*
	 * XXX should have director sets per ALTERNATIVE (per neg memo)
	 * or a mapping from alternative to rule number!
	 */
	/* int nont = negmemo_to_nont(memo); */
	if (from[memo] != NEGMEMO_BLOCKED /*|| if_glue_in_directorset(nont)*/)
	    continue;

	if (to[memo] == NEGMEMO_UNKNOWN) {
	    to[memo] = NEGMEMO_BLOCKED;
#if DEBUG_MEMO_BLOCKING >= 2
	    abs_message("negmemo #%d set to BLOCKED", memo);
#endif
	    continue;
	}

	if (to[memo] == NEGMEMO_BLOCKED) {
#if DEBUG_MEMO_BLOCKING
	    abs_message("Note: negmemo #%d already BLOCKED!", memo);
#endif
	    continue;
	}

	/* the target already holds something else than UNKNOWN or BLOCKED */
#if DEBUG_MEMO_BLOCKING
	abs_message("NOTE: negmemo #%d not UNKNOWN or BLOCKED: %d!", memo, to[memo]);
#endif
#if REBLOCK_NEGMEMOS
	/*
	 * Blocking a NegMemo in mid-flight looks very scary!
	 * However, it only concerns non-false rules that matched
	 * the empty string, which can be (and has been)
	 * done without looking at the input. However the
	 * follower set[1] makes clear that the actual input
	 * isn't a follower of this rule, so matching this
	 * rule doesn't "make sense".
	 *
	 * [1] the director set is the starter set, plus the
	 * follower set in case the rule is nonfalse.
	 */
	to[memo] = NEGMEMO_BLOCKED;
#endif
    }
}

/* Attach a pos memo table to a state extension; a NULL extension is ignored. */
static void add_state_pos_memos (StateExtension state, PosMemo *pos_memos)
{
  if (state == NULL)
    return;

  state->pos_memos = pos_memos;
}

/* Attach a nest marker array to a state extension; a NULL extension is ignored. */
static void add_state_nest_markers (StateExtension state, char *markers)
{
  if (state == NULL)
    return;

  state->nest_markers = markers;
}


/* Set all arts_ifd.nr_neg_memos entries of the table to NEGMEMO_UNKNOWN. */
static void initialize_neg_memos (NegMemo* neg_memos)
{
  int count = arts_ifd.nr_neg_memos;
  int k = 0;

  while (k < count)
    { neg_memos[k] = NEGMEMO_UNKNOWN;
      k++;
    }
}

/*
 * Set pos_memos[i] to POSMEMO_UNKNOWN for every rule number i whose bit
 * is set in the bitmap `list`.
 *
 * The bitmap is consumed one DATA element at a time; the ilval of each
 * element (converted to unsigned int) covers BITS_PER_WORD consecutive
 * rule numbers, least significant bit first. Only rule numbers below
 * arts_ifd.nr_syntax_nonterminals are looked at.
 */
static void unblock_pos_memos_from_list (PosMemo *pos_memos, DATA *list)
{
#if 0	/* posmemo unblock table is a list of rule numbers */
  while (*list != 0)
    { /* fprintf(stderr, "unblock_pos_memos_from_list: rule %ld\n", *list); */
      pos_memos[*list] = POSMEMO_UNKNOWN;
      list++;
    }
#else	/* posmemo unblock table is a bitmap of rule numbers */
  int total = arts_ifd.nr_syntax_nonterminals;
  int base = 0;		/* rule number of bit 0 of the current word */

  /* The first word is always fetched, whatever the number of rules. */
  do
    { unsigned int word = (unsigned int) list->ilval;
      int rule;

      list++;

      /*
       * Walk the bits of this word; stop as soon as no set bits remain
       * (this skips large groups of zeros) or the last rule is passed.
       */
      for (rule = base; word != 0 && rule < total; rule++)
	{ if (word & 0x01)
	    pos_memos[rule] = POSMEMO_UNKNOWN;
	  word >>= 1;
	}

      base += BITS_PER_WORD;
    }
  while (base < total);
#endif
}

/*
 * Walk the chain of regexp transitions starting at trans and, for each
 * of them, unblock the pos memos listed in the bitmap
 * dirp[<regexp number>].data.
 */
static void unblock_pos_memos_from_regexp (PosMemo *pos_memos, Transition trans, DATA *dirp)
{
  Transition cur;

  for (cur = trans; cur != NULL; cur = trel_trans_get_next(cur))
    { TransitionExtension info = GetTransitionExtension(cur);
      int regexp_nr = DECODE_REGEXP_NUMBER(info->terminal);
      DATA *directors = dirp[regexp_nr].data;

#if DEBUG_MEMO_BLOCKING
      abs_message("unblock_pos_memos_from_regexp: %d", regexp_nr);
#endif
      unblock_pos_memos_from_list (pos_memos, directors);
    }
}

/*
 * Walk the chain of terminal transitions starting at trans and, for each
 * of them, unblock the pos memos listed in the bitmap
 * dirp[<terminal number>].data.
 */
static void unblock_pos_memos_from_term (PosMemo *pos_memos, Transition trans, DATA *dirp)
{
  Transition cur;

  for (cur = trans; cur != NULL; cur = trel_trans_get_next(cur))
    { TransitionExtension info = GetTransitionExtension(cur);
      int term_nr = DECODE_TERM_NUMBER(info->terminal);
      DATA *directors = dirp[term_nr].data;

#if DEBUG_MEMO_BLOCKING
      abs_message("unblock_pos_memos_from_term: %d", term_nr);
#endif
      unblock_pos_memos_from_list (pos_memos, directors);
    }
}

/*
 * In the code we have a few tables made by "inverting" the director sets:
 * for each terminal, we have a list of grammar rules that may start with that
 * terminal.
 */

/*
 * Unblock, in pos_memos, every rule that appears in the inverted director
 * set of some transition leaving the given state. The transition kinds are
 * handled in this order: lexicon nonterminals, grammar terminals, end of
 * sentence, $MATCH, $SKIP and $OTHER.
 */
static void unblock_pos_memos_for_state (PosMemo *pos_memos, Trellis trellis, State state)
{
      int lex_class;

#if DEBUG_MEMO_BLOCKING
      abs_message("unblock_pos_memos_for_state: pos %d.%c", trel_state_get_pos(state), trel_state_get_lex_state(state));
#endif

      /* lexicon nonterminals: one inverted director set per class */
      for (lex_class = 0; lex_class < arts_ifd.nr_lexicon_nonterminals; lex_class++)
	{ if (trel_state_scan_lexterminal(trellis, state, lex_class) == NULL)
	    continue;
	  unblock_pos_memos_from_list (pos_memos, arts_ifd.lex_posmemo_dir[lex_class].data);
	}

      /* grammar terminals */
      unblock_pos_memos_from_term(pos_memos,
				  trel_state_scan_terminal(trellis, state),
				  arts_ifd.term_posmemo_dir);

      /* $end of sentence$ */
      unblock_pos_memos_from_term(pos_memos,
				  trel_state_scan_eos(trellis, state),
				  arts_ifd.term_posmemo_dir);

      /* $MATCHes */
      unblock_pos_memos_from_regexp(pos_memos,
				    trel_state_scan_match(trellis, state),
				    arts_ifd.match_posmemo_dir);

      /* $SKIPs */
      unblock_pos_memos_from_regexp(pos_memos,
				    trel_state_scan_skip(trellis, state),
				    arts_ifd.skip_posmemo_dir);

      /* $OTHERs: a single director set, stored at index 0 */
      if (trel_state_scan_other(trellis, state) != NULL)
	unblock_pos_memos_from_list(pos_memos, arts_ifd.other_posmemo_dir[0].data);
}

/*
 * For every rule that is BLOCKED in `from` and for which
 * if_glue_in_directorset() is false, block the matching entry of `to` when
 * that entry is still UNKNOWN, updating the blocked/unknown counters.
 * Entries of `to` that already hold something else are only detached and
 * blocked when REBLOCK_POSMEMOS is enabled.
 */
static
void copy_blocked_posmemos(PosMemo *from, PosMemo *to)
{
    int rule;

    for (rule = 0; rule < arts_ifd.nr_syntax_nonterminals; rule++) {
	if (from[rule] != POSMEMO_BLOCKED || if_glue_in_directorset(rule))
	    continue;

	if (to[rule] == POSMEMO_UNKNOWN) {
	    to[rule] = POSMEMO_BLOCKED;
	    posmemo_blocked_count++;
	    posmemo_unknown_count--;
#if DEBUG_MEMO_BLOCKING >= 2
	    abs_message("posmemo #%d set to BLOCKED", rule);
#endif
	    continue;
	}

	if (to[rule] == POSMEMO_BLOCKED) {
#if DEBUG_MEMO_BLOCKING
	    abs_message("Note: posmemo #%d already BLOCKED!", rule);
#endif
	    continue;
	}

	/* the target already holds something else than UNKNOWN or BLOCKED */
#if DEBUG_MEMO_BLOCKING
	abs_message("NOTE: posmemo #%d not UNKNOWN or BLOCKED!", rule);
	posmemo_rdump_pmprod(to[rule], 0);
	posmemo_print_tree(to[rule]);
#endif
#if REBLOCK_POSMEMOS
	/*
	 * Detaching and blocking a PosMemo in mid-flight looks
	 * very scary!
	 * However, it only concerns non-false rules that matched
	 * the empty string, which can be (and has been)
	 * done without looking at the input. However the
	 * follower set[1] makes clear that the actual input
	 * isn't a follower of this rule, so matching this
	 * rule doesn't "make sense".
	 *
	 * [1] the director set is the starter set, plus the
	 * follower set in case the rule is nonfalse.
	 *
	 * A danger is that it deletes a list that is
	 * currently being walked by for example PLAY_PM,
	 * which will cause chaos. (This actually happened
	 * in a French grammar.) This might be fixed if
	 * stack-references to memos are reference-counted
	 * too, but that is probably not worth the trouble.
	 */
	posmemo_detach_list(&to[rule]);
	to[rule] = POSMEMO_BLOCKED;
	posmemo_unknown_count--;
	posmemo_blocked_count++;
#endif
    }
}

/*
 * Run posmemo_init_table_entry() on each of the
 * arts_ifd.nr_syntax_nonterminals entries of the table. The state
 * argument is not used.
 */
static void initialize_pos_memos (PosMemo *pos_memos, State state)
{
  int rule = 0;

  while (rule < arts_ifd.nr_syntax_nonterminals)
    { posmemo_init_table_entry (&pos_memos[rule]);
      rule++;
    }
}

/*
 * Clear the nest markers. Indices 0 up to AND INCLUDING
 * arts_ifd.nr_lrec_nests are zeroed, i.e. nr_lrec_nests + 1 entries.
 */
static void initialize_nest_markers (char *markers)
{
  int last = arts_ifd.nr_lrec_nests;	/* inclusive upper bound */
  int k = 0;

  while (k <= last)
    { markers[k] = 0;
      k++;
    }
}

/*
 * Call posmemo_detach_list() on every non-NULL entry of a pos memo table
 * (arts_ifd.nr_syntax_nonterminals entries). A NULL table is ignored.
 */
static void reset_pos_memos (PosMemo *pos_memos)
{
  int count = arts_ifd.nr_syntax_nonterminals;
  int rule;

  if (pos_memos == NULL)
    return;

  for (rule = 0; rule < count; rule++)
    { if (pos_memos[rule] == NULL)
	continue;
      posmemo_detach_list(&pos_memos[rule]);
    }
}

/*
 * Callback run for a newly created state: allocate its StateExtension,
 * hook it onto the state, and give it freshly allocated and initialised
 * pos memo, neg memo and nest marker tables.
 */
static
void new_state_extension_cb(Trellis t, State state)
{
    StateExtension ext;
    PosMemo *pos_memos;
    NegMemo *neg_memos;
    char *nest_markers;

    DB_RTS(abs_message("new_state_extension_cb, pos=%d", trel_state_get_pos(state));)

    /*
     * neg_memos, pos_memos and nest_markers are filled in further down;
     * flags is left as it comes out of the allocator.
     */
    ext = (StateExtension) alloc_mem (&state_mem);
    ext->facts = NULL;
    ext->penalty_adjustment = 0;
    trel_state_set_ext(state, ext);

    /* pos memos: attached first, then initialised */
    pos_memos = alloc_pos_memos ();
    add_state_pos_memos (ext, pos_memos);
    initialize_pos_memos (pos_memos, state);

    /* neg memos */
    neg_memos = alloc_neg_memos ();
    initialize_neg_memos (neg_memos);
    add_state_neg_memos (ext, neg_memos);

    /* nest markers */
    nest_markers = alloc_nest_markers ();
    initialize_nest_markers (nest_markers);
    add_state_nest_markers (ext, nest_markers);
}

/*
 * Reset the pos memos stored in the extension of the given state.
 * The trellis argument is not used.
 */
void reset_state_extension(Trellis t, State state)
{
    StateExtension state_info = GetStateExtension(state);
    PosMemo *table = state_info->pos_memos;

    reset_pos_memos (table);
}

/*
 * Callback run when a state is freed: release the memo tables and nest
 * markers of its extension, free every transition on the extension's
 * `facts` chain, and finally give the extension itself back to state_mem.
 */
static
void free_state_extension_cb(Trellis t, State state)
{
    StateExtension ext = GetStateExtension(state);
    Transition fact;
    Transition following;

    DB_RTS(abs_message("free_state_extension_cb %p -> %p; pos=%d", state, ext, trel_state_get_pos(state));)

    maybe_free_pos_memos (ext->pos_memos);
    maybe_free_neg_memos (ext->neg_memos);
    maybe_free_nest_markers (ext->nest_markers);

    /* fetch the successor before the transition is freed */
    for (fact = ext->facts; fact != NULL; fact = following) {
	following = trel_trans_get_next(fact);
	trel_free_transition(t, fact);
    }

    free_mem (&state_mem, ext);
}

/*
 * Reset the pos memos of every state of the trellis, walking from the
 * initial state with trel_state_next(). A state without an extension is
 * reported with abs_message() and skipped.
 */
void reset_trellis_pos_memos (Trellis trellis)
{
    State state;

    for (state = trel_get_initial_state(trellis);
	 state != NULL;
	 state = trel_state_next(trellis, state)) {
	StateExtension ext = GetStateExtension(state);

	if (ext != NULL)
	    reset_pos_memos (ext->pos_memos);
	else
	    abs_message("State %p has no extension!\n", state);
    }
}

/*------------------------------------------------------------------------------
// Destruction of trellis
//----------------------------------------------------------------------------*/

/* Free a neg memo table, unless the pointer is NULL. */
static void maybe_free_neg_memos (NegMemo *memos)
{
  if (memos == NULL)
    return;

  free_neg_memos (memos);
}

/* Free a pos memo table, unless the pointer is NULL. */
static void maybe_free_pos_memos (PosMemo *memos)
{
  if (memos == NULL)
    return;

  free_pos_memos (memos);
}

/* Free a nest marker array, unless the pointer is NULL. */
static void maybe_free_nest_markers (char *markers)
{
  if (markers == NULL)
    return;

  free_nest_markers (markers);
}

/*
 * Dispose of a trellis by handing it to trel_delete(). With
 * SHOW_LEXINFO_NRS enabled the lexinfo numbers are shown afterwards.
 */
void delete_trellis (Trellis trellis)
{
    trel_delete (trellis);

#if SHOW_LEXINFO_NRS
    show_lexinfo_nrs ("end of delete_trellis");
#endif
}

/*------------------------------------------------------------------------------
// Printing of trellis
//----------------------------------------------------------------------------*/
/*
 * Print the (escaped) text of a transition. For an end-of-sentence
 * transition the text is printed as is; for anything else it is printed
 * between double quotes through lxcn_print_lexeme().
 */
static void print_terminal_text (Transition trans)
{
  TransitionExtension ext = GetTransitionExtension(trans);
  char *escaped = arts_dupstr_escaped (trel_trans_get_text(trans));

  if (!is_eos_transition (ext))
    { abs_printf ("\"");
      lxcn_print_lexeme(escaped, ext->lextype);
      abs_printf ("\"");
    }
  else
    { abs_printf ("%s", escaped);
    }

  /* the escaped copy belongs to us */
  abs_free (escaped, "print_terminal_text");
}

/*
 * Print one transition on a single line (no trailing newline):
 *   - "*" when the transition has UsedBit set in its flags, " " otherwise;
 *   - the transition text (see print_terminal_text);
 *   - for lexicon transitions: the nonterminal name and its decoded affixes;
 *   - for fact transitions: the nonterminal name and its raw parameters;
 *   - for other transitions: a marker for $SKIP, $MATCH, $OTHER, white space
 *     or an unknown kind (grammar terminals get no extra marker);
 *   - the penalty, when arts_ifd.transition_penalties_option is set;
 *   - " => " followed by the destination state, or "(none)".
 * Extra details are printed under DEBUG, DEBUG_POSMEMO, DEBUG_NONTNR and
 * PRINT_STATE_PTRS.
 */
void print_transition (Transition trans)
{ State trans_dest = trel_trans_get_dest (trans);
  int parno;
  TransitionExtension ext = GetTransitionExtension(trans);
  /*
   * stype is consumed by the DEBUG_NONTNR output below, so it has to be
   * declared whenever that option is on, not only under DEBUG.
   */
#if DEBUG || DEBUG_NONTNR
  int stype = DECODE_TERM_TYPE (ext -> terminal);
#endif

#if DEBUG_POSMEMO
  abs_printf("%p ", trans);
#endif
  /* marker: has this transition been used? */
  if (ext->flags & UsedBit) {
      abs_printf("*");
  } else {
      abs_printf(" ");
  }
  print_terminal_text (trans);
  if (is_lexicon_transition (ext))
    { int arity = DECODE_NONT_ARITY(ext -> terminal);
      int nontnr = DECODE_NONT_NUMBER(ext -> terminal);	/* Note coder nontnr */
      char *nontnm = arts_ifd.nonterm_names[nontnr].str;
      DATA *pdomain = arts_ifd.nont_formals_domains[nontnr].data;
      abs_printf (" ");
#if DEBUG
      /* abs_printf ("%08x", stype); */
      abs_printf ("%d:", nontnr);
#endif
      abs_printf ("%s", nontnm);
#if DEBUG
      abs_printf ("/%d", arity);
#endif

      /* one domain entry is consumed per parameter */
      for (parno = 0; parno < arity; parno++)
        { if (parno == 0) abs_printf ("(");
          else abs_printf (",");
          print_affix (ext -> params[parno], (int) pdomain->ilval);
	  pdomain++;
        };

      if (arity > 0) abs_printf (")");
    }
  else if (is_fact_transition (ext))
    { int arity = DECODE_NONT_ARITY(ext -> terminal);
      int nontnr = DECODE_NONT_NUMBER(ext -> terminal);	/* Note coder nontnr */
      char *nontnm = arts_ifd.nonterm_names[nontnr].str;
#if DEBUG
      DATA *pdomain = arts_ifd.nont_formals_domains[nontnr].data;
#endif

      abs_printf (" ");
#if DEBUG
      /* abs_printf ("%08x", stype); */
      abs_printf ("%d:", nontnr);
#endif
      abs_printf ("%s", nontnm);
      /* For facts, don't decode the affixes, since it seems that
       * the critical affixes are missing, and here we don't know
       * which ones they are.
       */
      for (parno = 0; parno < arity; parno++)
        { if (parno == 0) abs_printf ("(");
          else abs_printf (", ");
#if DEBUG
      /* Horrible hack - assume TEXT and INT affixes are critical and
       * not returned in this result...
       */
	  while ((int) pdomain->ilval < 0) pdomain++;
	  abs_printf("%08lx=", ext -> params[parno]); fflush(NULL);
	  abs_printf("<%08lx>", pdomain->ilval); fflush(NULL);
          print_affix (ext -> params[parno], (int) pdomain->ilval);
	  pdomain++;
#else
	  abs_printf("%08lx", ext -> params[parno]);
#endif
        };

      if (arity > 0) abs_printf (")");
    }
  else
    {
#if DEBUG_NONTNR
      abs_printf ("%x#%ld:", stype, DECODE_TERM_NUMBER(ext->terminal));
#endif

      if (is_skip_regexp_transition(ext))
	{ abs_printf (" $SKIP(\"");
	  abs_printf ("%s", arts_ifd.skip_regexp_names[
			    DECODE_REGEXP_NUMBER(ext -> terminal)].str);
	  abs_printf ("\")");
	}
      else if (is_match_regexp_transition(ext))
	{ abs_printf (" $MATCH(\"");
	  abs_printf ("%s", arts_ifd.match_regexp_names[
			    DECODE_REGEXP_NUMBER(ext -> terminal)].str);
	  abs_printf ("\")");
	}
      else if (is_other_transition(ext))
	abs_printf (" $OTHER");
      else if (is_terminal_transition(ext))
	{ /*
	    we don't want the same text again 
	    char *escd = arts_dupstr_escaped
			(arts_ifd.term_names[DECODE_TERM_NUMBER(ext -> terminal)].str);
	    abs_printf (" <TERMINAL> \"%s\"", escd);
	    abs_free (escd, "print_terminal");
	  */
	}
      else if (is_whitespace_transition(ext))
        {
	    abs_printf("<WSP>");
	}
      else
        {
	    abs_printf("<UNK.TRANS>");
	}
      /* $end of sentence$ not shown */
    };

  if (arts_ifd.transition_penalties_option)
    abs_printf (" [%d]", ext -> penalty);

  abs_printf (" => ");
#if DEBUG
  abs_printf ("{%08x} ", ext -> type);
#endif
  /* destination is printed as <position>.<lex state> */
  if (trans_dest) abs_printf ("%d.%c",
	  trel_state_get_pos(trans_dest),
	  trel_state_get_lex_state(trans_dest));
  else abs_printf ("(none)");


#if PRINT_STATE_PTRS
  abs_printf (" (%p)", trans_dest);
#endif
}

/*
 * Print the identification of a state as "<position>.<lex state> ";
 * with PRINT_STATE_PTRS the state pointer is appended.
 */
static void print_state_number(State state)
{
    int pos = trel_state_get_pos(state);

    abs_printf ("%5d.%c ", pos, trel_state_get_lex_state(state));

#if PRINT_STATE_PTRS
    abs_printf ("(state=%p) ", state);
#endif
}

/*
 * Print all transitions leaving a state: the state number first, then the
 * transitions separated by ", \n\t", then a newline. A state without
 * transitions prints nothing, unless arts_ifd.graph_option >= 2: in that
 * case the state number is always printed and the transitions on the
 * extension's `facts` chain are listed as well.
 */
void print_state_transitions (Trellis t, State state)
{ /* state != NULL, caller has checked */
    TransIter it = trel_trans_iter(t, state);
    Transition trans;
    int nothing_printed = 1;

#if 0
    if (1) {
	StateExtension ext = GetStateExtension(state);
	if (ext->flags & MemoBit) {
	    abs_printf("+");
	} else {
	    abs_printf(" ");
	}
    }
#endif

    /* the ordinary transitions */
    for (trans = trel_trans_iter_next(it);
	 trans != NULL;
	 trans = trel_trans_iter_next(it)) {
	if (nothing_printed)
	    print_state_number(state);
	else
	    abs_printf(", \n\t");
	print_transition (trans);
	nothing_printed = 0;
    }
    trel_trans_iter_end(it);

    if (arts_ifd.graph_option >= 2) {
	StateExtension ext;

	/* in this mode the state number is shown even without transitions */
	if (nothing_printed) {
	    print_state_number(state);
	    nothing_printed = 0;
	}

	/*
	 * The state number is out by now, so every fact is preceded by
	 * the separator.
	 */
	ext = GetStateExtension(state);
	for (trans = ext->facts; trans != NULL; trans = trel_trans_get_next(trans)) {
	    abs_printf(", \n\t");
	    print_transition(trans);
	}
    }

    if (!nothing_printed)
	abs_printf ("\n");
}

/*--------------------------------------------------------------------------
// Top level routine for printing the lexical graph.
//------------------------------------------------------------------------*/
/*
 * Print the transitions of every state of the trellis, starting at the
 * initial state and following trel_state_next().
 */
void print_trellis (Trellis trellis)
{
    State cur;

    for (cur = trel_get_initial_state(trellis);
	 cur != NULL;
	 cur = trel_state_next(trellis, cur))
	print_state_transitions (trellis, cur);
}

/*-------------------------------------------------------------------------
// Find the nearest State (i.e. the one with the lowest pos) that can
// be reached from here. Regexp ($MATCH or $SKIP) transitions are only
// considered if there are no other transitions.
//
// If the state has an EOS transition, the current code returns NULL.
//
// Return value: NULL if no transition, new State otherwise.
//-----------------------------------------------------------------------*/
State get_shortest_transition (Trellis trellis, State istate)
{
    /* This is not totally equivalent to the old code */
    Transition found;

    /* a state with an end-of-sentence transition yields no next state */
    if (trel_state_scan_eos(trellis, istate))
	return NULL;

    /* otherwise: the destination of whatever trel_state_scan_any() finds */
    found = trel_state_scan_any(trellis, istate);
    if (!found)
	return NULL;

    return trel_trans_get_dest(found);
}

/*
 * Return NULL when istate has an end-of-sentence transition. Otherwise
 * return the state found by trel_state_next_white(trellis, istate, skip)
 * or, when that yields nothing, the end-of-sentence state of the trellis.
 */
State get_next_whitespace_or_eos (Trellis trellis, State istate, int skip)
{
    State white;

    if (trel_state_scan_eos(trellis, istate))
	return NULL;

    white = trel_state_next_white (trellis, istate, skip);
    if (white == NULL)
	return trel_get_eos_state(trellis);

    return white;
}

/*
  The start point of the building of the trellis
*/
/*
 * Create a trellis for one piece of input and configure it.
 *
 * input:  the text to be analysed; handed to trel_set_input().
 * linenr: line number passed along with the input.
 * colnr:  column number passed along with the input.
 *
 * The trellis is created with trel_init() for
 * arts_ifd.nr_lexicon_nonterminals lexicon nonterminals, and is given the
 * lexicon, the white space and separator characters, the combined $MATCH
 * and $SKIP regexps and the callbacks of this module.
 *
 * Returns the new trellis.
 */
Trellis initialize_trellis (char *input, int linenr, int colnr)
{ Trellis trellis;

  /*
   * Note: the length of the input used to be computed here with strlen(),
   * but the result was never used; that dead computation has been dropped.
   */
  trellis = trel_init (arts_ifd.nr_lexicon_nonterminals);
  trel_set_input(trellis, input, linenr, colnr);
  trel_set_lexicon(trellis, arts_ifd.lexicon);
  trel_set_white(trellis, arts_ifd.white_space_chars);
  trel_set_separators(trellis, arts_ifd.separator_chars);
  trel_set_match_regex(trellis, match_combined);
  trel_set_skip_regex(trellis, skip_combined);

  /* callbacks for transitions */
  trel_set_scanned_lexicon(trellis, scanned_lexicon_cb);
  trel_set_scanned_white(trellis, scanned_whitespace_cb);
  trel_set_scanned_other(trellis, scanned_other_cb);
  trel_set_scanned_match(trellis, scanned_match_cb);
  trel_set_scanned_skip(trellis, scanned_skip_cb);
  trel_set_scanned_eos(trellis, scanned_eos_cb);
  trel_set_scanning_completed(trellis, scanning_completed_cb);
  trel_set_free_trans_callback(trellis, free_transition_extension_cb);

  /* callback for state */
  trel_set_new_state_callback(trellis, new_state_extension_cb);
  trel_set_free_state_callback(trellis, free_state_extension_cb);

  return (trellis);
}
