/*
   File: erts_trellis_impl.h
   Defines the basic trellis datastructures for the EAG3 runtime system.

   Copyright 2012 Marc Seutter

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: erts_trellis_impl.h,v 1.12 2013/03/13 14:32:49 marcs Exp $"
*/

#ifndef IncErtsTrellisImpl
#define IncErtsTrellisImpl

/* libebase includes */
#include "ebase_input.h"

/* local includes */
#include "erts_handle.h"

/*
   There are 4 states associated with every input position.
   Matching begins before any white space which follows a word,
   so that suffixes can be properly matched.
  
   S    before white space.
        Any matching attempt from this state must match space+, or it fails.
        This brings you to state W. After the space the desired terminals are tried.

        If white space matching has been turned off and there are neither
        white space characters nor separators, epsilon transitions are
        inserted to reach the start of word state (W).

   W    at the start of a Word. Depending on what kind of terminals are matched
        (prefix, suffix, etc), the next state will be
        fullword (word) -> E
        prefix   (y-)   -> I
        (other terminals fail)
  
   I    Inside a word: a word cannot end here.
        The next state will be:
        prefix/infix    (y-, -y-)  -> I
        fullword/suffix (word, -y) -> E
  
   E    End of a word, and implicitly the begin (S) of the next word.
        A suffix can also be matched and returns to the same state.
        suffix          (-y)  -> E
  
   The GLUE operator makes an epsilon transition between the E and I state.
  
   Implicit in this scheme is that these states exist for every single input position,
   since most state changes described above are associated with consuming a certain
   amount of input, therefore with different input positions.

   Because state E of one word equals state S of the next, the outgoing transitions
   for E are tried first, then those of S. Fortunately they are disjoint.

     ws+           word         empty
   S ---> W  ---------------> E ----->  S
           \                 /^\
            \               / | \
             \y-       -y- /  \ | -y
              \        /  /    \/
               \      V  /
                \       / ^
                 \     / /
                  V   / -y
                   \ / word
                   ^I
                  /  \
                 |   |
                 \---/
                y-  -y-
*/

/*
   Encoding of lexical states.

   LEX_STATE_E shares its value with LEX_STATE_S: as described above, the
   end (E) of one word is at the same time the start (S) of the next word,
   so both names denote one and the same state. Consequently there are only
   NR_LEX_STATES (3) distinct non-error states: S/E, W and I.
   LEX_STATE_ERROR (0) is not counted in NR_LEX_STATES.
*/
#define LEX_STATE_ERROR 0
#define LEX_STATE_S 1
#define LEX_STATE_W 2
#define LEX_STATE_I 3
#define LEX_STATE_E 1
#define NR_LEX_STATES 3

/*
   Encoding of flags.

   Every flag is a distinct power of two, so any combination of them fits
   in a single integer and can be tested with a bitwise and.
   NOTE(review): presumably these are the bits kept in the lex_flags field
   of struct state_rec; the per-flag meanings are only suggested by their
   names -- confirm against the code that sets and tests them.
*/
#define LEX_STATE_FULL_WORDS_PRESENT 1
#define LEX_STATE_PART_WORDS_PRESENT 2
#define LEX_STATE_WS_TRIED 4
#define LEX_STATE_EOS_TRIED 8
#define LEX_STATE_FULLY_SCANNED 16

/*
   Introduce the structure of states.

   A state describes one lexical state (see the LEX_STATE_ encoding) at one
   position of the trellis input. Its position is recorded both as an offset
   from the start of the trellis and as a line/column pair.
   NOTE(review): State and Transition are declared elsewhere (presumably as
   pointers to state_rec and transition_rec) -- confirm in erts_handle.h.
*/
struct state_rec
{ Transition trans;		/* Sorted list of transitions leaving this state */
  int offset;			/* Offset between start of trellis and this state */
  int linenr;			/* Line number of this state */
  int colnr;			/* Column number of this state */
  short lex_flags;		/* Flags: part/full words present in this state */
  uchar lex_state;		/* S, W, I or error (E has the same encoding as S) */
  uchar lex_phase;		/* Phase of the state */
  int *neg_memos;		/* Array of negative memo bits */
};

/*
   Transitions are labeled by their class, entry number (terminal nr, lex nont nr or
   regexp nr), and penalty. The call number of a lexicon nonterminal is kept in the
   info field

   The classes themselves form an internal trellis (hex coded, the definitions can
   be found in ebase_input.h as the compiler needs them to code director sets):

   00: error
   01: lexicon nonterminal
   02: terminal
       -- Note: this encompasses all phase 1 transitions
   03: $WORD (namely the shortest transition from the lexicon, ending a word)
       -- Note: this encompasses all phase 2 transitions
   04: $MATCH
   07: -- Note: this encompasses all phase 3 transitions
   08: $SKIP
   0C: (If we ever want it $REGEXP)
   0F: (If we ever want it $RECOGNIZABLE by either lexicon or regexps)
       -- Note: this encompasses all phase 4 transitions
   10: $OTHER
       -- Note: this encompasses all phase 5 transitions
   1F: $ANY
   20: ($WS)	(treated special)
   40: ($EOS)	(treated special)

   Flags to remember certain transitions
*/
/*
   Transition flag bits; distinct bits, so both may be set at once.
   NOTE(review): presumably kept in the flags field of struct
   transition_rec -- confirm against the code that sets them.
*/
#define TRANS_FULL_WORD 0x01
#define TRANS_USED 0x02

/*
   Introduce the structure of transitions.

   A transition is one element of the list of transitions that hangs off a
   state (field trans of struct state_rec) and leads to a target state.
   It is labeled by its class, its entry number and its penalty; for a
   lexicon nonterminal the call number is kept in the info field.
   NOTE(review): from and to presumably point into the trellis input text
   and delimit the text covered by the transition -- confirm with the
   code that creates transitions.
   NOTE(review): the field name class is a reserved word in C++, so this
   header can only be compiled as C.
*/
struct transition_rec
{ Transition next;		/* Next in the list of transitions */
  State target;			/* Target state of this transition */
  char *from;			/* Start of transition */
  char *to;			/* End of transition */
  short class;			/* Transition class (hex coded, see above) */
  short flags;			/* Transition flags */
  int nr;			/* Entry nr: terminal nr, lex nont nr or regexp nr */
  int info;			/* Call nr for lexicon nonterminals */
  Penalty penalty;		/* Penalty incurred by translations, possibly modified */
};

/*
   A trellis is allocated for a memory chunk that is to be parsed.
   This may either be read from the input or just composed as a TEXT affix,
   which is meta defined and is now in need of a check at the second level
   of the grammar. Since meta parsing may start at any point during
   affix propagation, the trellises for meta and first level parsing form a
   small stack reachable from the handle.

   After mallocing the trellis record, an array of pointers to states 
   of size NR_LEX_STATES * (input length + 1) is allocated. Whenever
   a state needs to be allocated, it is malloced and entered into this array,
   thus forming a state cache. Note that most entries in the array
   will remain null, since it is very unlikely that every input position
   is the start of a transition, and equally unlikely that an input position
   has all 3 of its possible states in use.
*/

/*
   Introduce the structure of a trellis.

   One trellis record exists per chunk of text that is to be parsed.
   Active trellises are chained through the previous field, which gives
   the small stack of trellises mentioned above.
*/
struct trellis_rec
{ Lexicon lexicon;		/* Pointer to lexicon structure */
  Trellis previous;		/* Pointer to previous active trellis */
  int neg_memo_size;		/* Precalculated neg memo size */

  /* Information about the current chunk of input */
  /*
     NOTE(review): the size given for state_cache below reads as
     (NR_LEX_STATES * length) + 1, whereas the description above this
     structure says NR_LEX_STATES * (input length + 1) -- confirm against
     the code that allocates and indexes the cache.
  */
  State *state_cache;		/* NR_LEX_STATES * length + 1 */
  char *input;			/* Trellis input text */
  int length;			/* Length of input */
  int linenr;			/* Line number of input text */
  int colnr;			/* Column number of input text */

  /* Dynamic parse information */
  State curr_state;		/* Current (meta) parser state */ 
};

#endif /* IncErtsTrellisImpl */
