/*
   File: rtslex.h
   Interface to lexical analysis module.

   Copyright 2005 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: rtslex.h,v 1.79 2007/11/01 11:03:52 olafs Exp $"
*/
#ifndef RTSLEX_H
#define RTSLEX_H

/* standard includes */
#include <limits.h>

/* libabase includes */
#include "abase_repr.h"

/* liblexicon includes */
#include <lexicon.h>
#include <lxcn_input.h>
#include <lexicon_search.h>

#ifdef PMRTS
/* The include of "posmemo.h" is disabled here; an opaque forward declaration
   of struct _PMPROD is all this header needs for the pointer typedef below. */
struct _PMPROD;
typedef struct _PMPROD *local_PosMemo;
#endif

/* Scalar types used throughout the lexer interface. */
typedef long		ARG;
typedef unsigned int	TransitionType;	/* holds the ...Bit flags defined further down */
typedef unsigned int	Position;	/* not completely abstracted yet */
typedef unsigned int	Terminal;	/* should be used by rtsagfl.c */

/*------------------------------------------------------------------------------
// Penalty management
// Penalties are defined to be 32 bit integers; the Penalty type itself is
// declared outside this file.
//
// MAX_PENALTY	largest penalty value (0x7fffffff)
// MIN_PENALTY	negation of MAX_PENALTY
// UNK_PENALTY	one below MIN_PENALTY, i.e. outside MIN_PENALTY..MAX_PENALTY;
//		serves as the "unknown" marker (see NEGMEMO_UNKNOWN)
// NO_PENALTY	zero
//----------------------------------------------------------------------------*/
#define MAX_PENALTY	((Penalty) 0x7fffffff)
#define MIN_PENALTY	((Penalty) -MAX_PENALTY)
#define UNK_PENALTY	((Penalty) (MIN_PENALTY - 1))
#define NO_PENALTY	((Penalty) 0)

/*------------------------------------------------------------------------------
// Negative memoization
// The idea is that a parse should be blocked if the current penalty
// >= negative memo, unless the negative memo is still unknown.
//
// NEGMEMO_BLOCKED	MIN_PENALTY: every penalty in MIN_PENALTY..MAX_PENALTY
//			is >= this value, so the test above always holds
// NEGMEMO_UNKNOWN	UNK_PENALTY: the out-of-range "still unknown" marker
// NEGMEMO_SUCCESS	MAX_PENALTY: the largest regular penalty value
//----------------------------------------------------------------------------*/
typedef Penalty		NegMemo;
#define NEGMEMO_BLOCKED MIN_PENALTY
#define NEGMEMO_UNKNOWN	UNK_PENALTY
#define NEGMEMO_SUCCESS MAX_PENALTY

/*------------------------------------------------------------------------------
// LexInfo is the parameter block passed to init_lexer ().
// The three string tables come with matching nr_... counters; presumably each
// counter gives the number of entries in the table of the same name
// (TODO confirm against the code that fills this struct).
//----------------------------------------------------------------------------*/
typedef struct
{
  Lexicon	lexicon;
  Trie		trie;

  /* String tables */
  char		**terminals;
  char		**matches;
  char		**skips;

  /* Counters */
  unsigned int	nr_nont_classes;
  unsigned int	nr_terminals;
  unsigned int	nr_matches;
  unsigned int	nr_skips;
  unsigned int	nr_neg_memos;
  unsigned int	nr_syntax_nonterminals;

  Terminal	eos_terminal;

  /* Character tables (unsigned char strings) */
  unsigned char	*blanks;
  unsigned char	*terminators;
  unsigned char	*invisibles;
  unsigned char	*translate_src;
  unsigned char	*translate_dst;
} LexInfo;

/*---------------------------------------------------------------------------
// AggregatePartStates contains 1 bit for each of the 16
// possible partstates (0..15) at some position within token.
// At the position after the last valid part, it contains an extra flag
// NextTokenStartsHere, meaning we couldn't find a parts_transition but
// we did find a next token there.
//
// NOTE(review): MaxPartState expands to MultiTokenBit (0x00010, i.e. 16), so
// NextTokenStartsHere is 1 << 17; the "16 partstates (0..15)" wording above
// may be out of date -- confirm which range is intended.
//-------------------------------------------------------------------------*/
typedef unsigned	AggregatePartStates;
#define PartBitsMask	0x0001F		/* translate part_state into number */
#define MaxPartState	MultiTokenBit	/* partState now only has latest kind */
  /* Since 2000aug07, a partstate only has one bit (the last part type
  ** so far), so we introduced MaxPartState, which is no longer PartBitsMask
  ** (shifting by "PartBitsMask + 1" would push the bit over the size of a
  ** long).  MultiTokenBit is defined further down; that is fine because the
  ** macro is only expanded where it is used.
  */
#define NextTokenStartsHere	(1 << (MaxPartState + 1))

/*----------------------------------------------------------------------------
// Bit-fields of TransitionType, also used for partState:
// (0..MaxPartState is also used for looping over all 16 possible part_states,
// so it must be the lower bits of a word)
//
// The five lowest bits (PrefixBit .. MultiTokenBit) together make up
// PartBitsMask (0x0001F).
//--------------------------------------------------------------------------*/
/* Part kind of the token (covered by PartBitsMask) */
#define PrefixBit	0x00001		/* prefix token */
#define InfixBit	0x00002		/* infix token */
#define SuffixBit	0x00004		/* suffix token */
#define SingleTokenBit	0x00008		/* single token */
#define MultiTokenBit	0x00010		/* multi token */
#define UnusedBits	0x00020		/* for future expansion */
/* Origin of the terminal */
#define OtherBit        0x00040		/* terminal is other */
#define TermBit		0x00080		/* terminal is grammar lexeme */
#define LexBit		0x00100		/* terminal is lexicon lexeme */
#define MatchBit	0x00200		/* terminal is regexp match */
#define SkipBit		0x00400		/* terminal is regexp skip */
/* Transition properties */
#define TransPartsBit	0x00800		/* has transition to next part */
#define TransTokenBit	0x01000		/* has transition to next token */
#define EosBit		0x02000		/* EOS */
#define TxtFreeBit	0x04000		/* transition -> text is malloced */
#define TokenStartBit	0x08000		/* =0 at start of 2nd and following parts */
#define TokenPartBit	0x10000		/* terminal is (live) part of token */
			/* Caution: FinalPartBit and NonfinalPartBit are not
			   mutually exclusive; both may be set together */
#define FinalPartBit	0x20000		/* part has transition to next token */
#define NonfinalPartBit	0x40000		/* part has transition to next part */

/*----------------------------------------------------------------------------
// A Transition holds the info associated with a terminal that starts at a
// certain character position.
//
// Alternative Transitions at the same position are chained through `next';
// `trans_dest_state' indicates the StateNode at the first terminal position
// after the token.
//--------------------------------------------------------------------------*/
typedef struct Transition
{
  TransitionType	type;			/* combination of the ...Bit flags */
  Terminal		terminal;		/* bitwise encoding, see ENCODE_... / DECODE_... */
  char			*text;			/* malloced when TxtFreeBit is set in type */
  unsigned int		len;
  TaggedValue		*params;
  Penalty		penalty;
  struct StateNode	*trans_dest_state;	/* state after the token */
  struct Transition	*next;			/* next Transition at the same position */
} Transition;

/*--------------------------------------------------------------------------
// A StateNode holds the info associated with a certain position in the text.
// The number of negative memos allocated with a node is given by agfl-coder;
// the number of positive memos equals the number of syntax rules.
// The positive memo and left recursion arrays only exist in PMRTS builds.
//------------------------------------------------------------------------*/
typedef struct StateNode
{
  Transition	**trans_lists;	/* the array of transition lists */
  NegMemo	*neg_memos;	/* nr is given by agfl-coder */

#ifdef PMRTS
  local_PosMemo	*pos_memos;	/* positive memo array (per rule) */
  char		*lrec_markers;	/* left recursion marker array (per rule) */
#endif /* PMRTS */

  Position	pos;		/* offset in chars relative to starting position */
  unsigned int	linenr;		/* original line */
  unsigned int	colnr;		/* original column */
} StateNode;

/* A StateIndicator designates a state by pointing directly at its StateNode. */
typedef struct StateNode *StateIndicator;

/*---------------------------------------------------------------------------
// Note that we assume an unsigned int to have at least 32 bits
// in the coding of Transition.terminal:
//
// for nonterminals: 4 bits type, 20 bits number, 8 bits arity
// for terminals, regexp_match, and regexp_skip: 4 bits type, 28 bits number
// typebits 0000 nonterminal
//	    0001 terminal
//	    0010 regexp_match
//	    0011 regexp_skip
//	    0100 other
//
// The ENCODE_ macros do not mask their arguments: the caller must pass a
// number (and arity) that fits the field widths listed above.
// Every macro parameter is parenthesized in its expansion, so compound
// expressions (e.g. a conditional expression as arity) encode correctly.
//
// DECODE_NONT_NUMBER and DECODE_NONT_CLASS extract the same 20 bit field.
//
// DECODE_TERM_OR_RE_CLASS maps type nibble 1, 2, 3 onto
// nr_lexicon_nonterminals + 0, + 1, + 2; the identifier
// nr_lexicon_nonterminals is not defined in this file and must be in scope
// wherever the macro is expanded.
//-------------------------------------------------------------------------*/
#define DECODE_TERM_TYPE(term)		(((term) >> 28) & 0x0F)
#define TERM_IS_NONT(term)		(((term) & 0xF0000000) == 0x00000000)
#define DECODE_NONT_NUMBER(nont)	(((nont) >> 8) & 0xFFFFF)
#define DECODE_NONT_CLASS(nont)		(((nont) >> 8) & 0xFFFFF)
#define DECODE_NONT_ARITY(nont)		((nont) & 0xFF)
#define TERM_IS_TERM(term)		(((term) & 0xF0000000) == 0x10000000)
#define DECODE_TERM_NUMBER(term)	((term) & 0x0FFFFFFF)
#define TERM_IS_MATCH(term)		(((term) & 0xF0000000) == 0x20000000)
#define TERM_IS_SKIP(term)		(((term) & 0xF0000000) == 0x30000000)
#define TERM_IS_OTHER(term)		(((term) & 0xF0000000) == 0x40000000)
#define DECODE_REGEXP_NUMBER(term)	((term) & 0x0FFFFFFF)
#define ENCODE_NONT(nr,arity)		(((nr) << 8) | (arity))
#define ENCODE_TERM(nr)			((nr) | 0x10000000)
#define ENCODE_MATCH(nr)		((nr) | 0x20000000)
#define ENCODE_SKIP(nr)			((nr) | 0x30000000)
#define ENCODE_OTHER(nr)		((nr) | 0x40000000)
#define DECODE_TERM_OR_RE_CLASS(term)	(DECODE_TERM_TYPE(term) - 1 + nr_lexicon_nonterminals)

/*----------------------------------------------------------------------------
// A Trellis holds an array of pointers to StateNodes.  For each position
// there is a possibly empty list of states, representing the tokens that
// have been matched at that position (if any).
//--------------------------------------------------------------------------*/
typedef struct
{
  unsigned int	length;		/* paragraph span of trellis, i.e. largest position */
  StateNode	*first_state;	/* starting state of trellis */
  StateNode	**states_row;	/* allocated separately */
  StateNode	**pstates_row;	/* allocated separately, for parts */
  unsigned int	*last_part_end_from;
  Lexicon	lexicon;
} Trellis;


/* Access functions (declarations only; grouped by the object they take) */

/* StateNode / Trellis */
Position STATE_POS (StateNode *ind);
void SET_FIRST_POS (Trellis *trel, Position pos);
StateIndicator GET_FIRST_STATE_INDICATOR (Trellis *trel);
StateNode **GET_TRELLIS_STATE_ROW (Trellis *trel);
StateNode **GET_TRELLIS_PARTS_ROW (Trellis *trel);
Transition *GET_STATE_TRANSLIST (Trellis *trel, StateIndicator i_st, ARG cls);

/* Transition */
int IS_LASTPART (Transition *t);
int HAS_PARTS_TRANSITION (Transition *t);
Terminal TRANSITION_TERMINAL (Transition *tra);
TaggedValue *TRANSITION_PARAMS (Transition *tra);
char *TRANSITION_TEXT (Transition *tra);
Transition *TRANSITION_NEXT_TRANS (Transition *tra);
Penalty TRANSITION_PENALTY (Transition *tra);
StateIndicator TRANSITION_DEST_STATE_INDICATOR (Transition *tra, Trellis *trel);

/* Exported functions (declarations only) */

/* Lexer as a whole; init_lexer () receives its settings through LexInfo */
void init_lexer (LexInfo *info);
void end_lexer (void);

/* Operations on a whole trellis */
Trellis *make_trellis_by_word_lexing (char *input, int linenr, int colnr,
				      Lexicon the_lex);
void delete_trellis (Trellis *trellis);
void print_trellis (Trellis *trellis);
void show_neg_memo_blocks (Trellis *trellis);
int is_empty_trellis (Trellis *trellis);

/* Operations on a single state or transition */
StateNode *get_shortest_transition (Trellis *trellis, StateNode *istate);
int state_has_eos_transition (StateNode *the_state);
LexemeType get_transition_lex_type (Transition *transition);

#ifdef PMRTS
/* Only declared in PMRTS builds, like the pos_memos array of StateNode. */
void reset_trellis_pos_memos (Trellis *trellis);
#endif

/* Getters without arguments; presumably they report the corresponding values
   handed to init_lexer () through LexInfo -- TODO confirm.
   NOTE(review): get_nr_syntax_nonterminals returns unsigned long although
   LexInfo.nr_syntax_nonterminals is a plain unsigned -- confirm intended. */
unsigned get_nr_neg_memos(void);
unsigned long get_nr_syntax_nonterminals(void);
Terminal get_eos_terminal(void);

#endif /* RTSLEX_H */
