/*
   File: arts_ds.h
   Defines all data_structures relevant for the AGFL runtime system

   Copyright 2006 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: arts_ds.h,v 1.18 2009/02/12 14:44:18 olafs Exp $"
*/
#ifndef IncArtsDs
#define IncArtsDs

/* standard includes */
#include <limits.h>

/* libabase includes */
#include <abase_repr.h>

/* liblexicon includes */
#include <lxcn_input.h>
#include <lxcn_lexicon.h>

/*
   Legacy scalar typedefs (original note: "Old defs; will be changed in
   due time").

   ARG            - a long; appears as the 'arg' member of union Cel
   TransitionType - an int holding the flag bits PrefixBit .. NonfinalPartBit
   Terminal       - an int holding a bitwise-encoded terminal; see the
                    ENCODE_xxx and DECODE_xxx macros further down
   Position       - an int; used for StateNode.pos and for
                    Trellis.last_part_end_from
*/
typedef long		ARG;
typedef int		TransitionType;
typedef	int		Terminal;	/* should be used by rtsagfl.c */
typedef int		Position;	/* Define as int32 later */

/*------------------------------------------------------------------------------
// Reserve two special types and top domain values
//
// INT_TYPE and TEXT_TYPE are the two reserved type numbers; their negative
// replacement lists are parenthesized so that each macro always expands to a
// single operand, whatever expression it is pasted into.
//
// TOP_INT and TOP_TEXT are the top domain values for the two special types.
// NOTE(review): 0x80000000 does not fit in a 32 bit int, so as a C constant
// it has an unsigned type; confirm that comparisons against signed values
// (e.g. a sign extended long) behave as intended.
//----------------------------------------------------------------------------*/
#define INT_TYPE        (-1)
#define TEXT_TYPE       (-2)
#define TOP_INT		0x80000000
#define TOP_TEXT	NULL

/*------------------------------------------------------------------------------
// Penalty management
// Penalties are defined to be 32 bit integers
//
// The Penalty type itself is not declared in this file; it has to come from
// one of the included headers.
//
//   MAX_PENALTY   0x7fffffff, the largest penalty
//   MIN_PENALTY   -MAX_PENALTY, the smallest regular penalty
//   UNK_PENALTY   MIN_PENALTY - 1, one below the regular range
//   NO_PENALTY    0
//----------------------------------------------------------------------------*/
#define MAX_PENALTY	((Penalty) 0x7fffffff)
#define MIN_PENALTY	((Penalty) -MAX_PENALTY)
#define UNK_PENALTY	((Penalty) (MIN_PENALTY - 1))
#define NO_PENALTY	((Penalty) 0)

/*------------------------------------------------------------------------------
// Negative memoization
// The idea is that a parse should be blocked if the current penalty
// >= negative memo unless the negative memo is still unknown
//
// A NegMemo is stored as a Penalty. Three Penalty constants double as
// markers:
//   NEGMEMO_BLOCKED is MIN_PENALTY
//   NEGMEMO_UNKNOWN is UNK_PENALTY
//   NEGMEMO_SUCCESS is MAX_PENALTY
//----------------------------------------------------------------------------*/
typedef Penalty		NegMemo;
#define NEGMEMO_BLOCKED MIN_PENALTY
#define NEGMEMO_UNKNOWN	UNK_PENALTY
#define NEGMEMO_SUCCESS MAX_PENALTY

/*------------------------------------------------------------------------------
// Positive memoization
// When a parse of a nonterminal has succeeded at a certain point in the input,
// we remember it with its resultant affixes. Positive memoization is also used
// to implement left recursion
//
// StateNode and Transition are forward declared here because pos_memo_rec
// needs a pointer to a StateNode; the structs themselves are defined further
// down in this header.
//----------------------------------------------------------------------------*/
typedef struct state_node_rec StateNode;
typedef struct transition_rec Transition;

/* One remembered solution: PMPROD is the record, PosMemo a pointer to it */
typedef struct pos_memo_rec
{ /* Description of the recognized rule */
  int nont_nr;				/* number of the nonterminal */
  int nr_formals;
  int nr_variables;			/* formals + locals */
  int nr_sons;

  /* Contains formals, locals, posmemo's of sons and typing of sons */
  Value *variables;

  /* Next info */
  StateNode *next_state;		/* Next trellis state */
  Penalty penalty;			/* Penalty of this rule */
  void *pass2;				/* NOTE(review): purpose not visible in this header */

  /* Posmemo admin */
  struct pos_memo_rec* next;		/* Next solution pointer */
  struct pos_memo_rec* equiv;		/* Pointer to equivalent nodes */
  struct pos_memo_rec* prime;		/* Pointer to first equivalent node */
  int refcount;				/* How many times is this used? */
} PMPROD, *PosMemo;

/* Define shorthands for blocked and unknown PosMemo.
   Both are sentinel pointer values (0 and 1) rather than addresses of real
   PMPROD records, so they must not be dereferenced. */
#define POSMEMO_BLOCKED ((PosMemo) 0L)
#define POSMEMO_UNKNOWN ((PosMemo) 1L)

/*---------------------------------------------------------------------------
// AggregatePartStates contains 1 bit for each of the 16
// possible partstates (0..15) at some position within token.
// At the position after the last valid part, it contains an extra flag
// NextTokenStartsHere, meaning we couldn't find a parts_transition but
// we did find a next token there.
//
// MaxPartState expands to MultiTokenBit, which is defined together with the
// other TransitionType bits later in this header (0x00010 there), so
// NextTokenStartsHere works out to 1 << 17.
// NOTE(review): the text above speaks of 16 part states (0..15) while
// MaxPartState itself is 16; confirm which range is intended.
//-------------------------------------------------------------------------*/
typedef int		AggregatePartStates;
#define PartBitsMask	0x0001F		/* translate part_state into number */
#define MaxPartState	MultiTokenBit	/* partState now only has latest kind */
  /* since 2000aug07, a partstate only has one bit (the last part type
  ** so far), so we introduced MaxPartState, which is no longer PartBitsMask
  ** ("PartBitsMask + 1" would push the bit over the size of a long)
  */
#define NextTokenStartsHere	(1 << (MaxPartState + 1))

/*----------------------------------------------------------------------------
// Flag bits of a TransitionType; the same bits are reused for partState.
// The range 0..MaxPartState is also walked to visit all 16 possible
// part_states, which is why the part kinds must sit in the lowest bits
// of the word.
//
// Every flag is written as (1 << n), n being its bit position.
//--------------------------------------------------------------------------*/

/* Kind of token (part) */
#define PrefixBit	(1 << 0)	/* prefix token */
#define InfixBit	(1 << 1)	/* infix token */
#define SuffixBit	(1 << 2)	/* suffix token */
#define SingleTokenBit	(1 << 3)	/* single token */
#define MultiTokenBit	(1 << 4)	/* multi token */

/* Kind of terminal */
#define FactBit		(1 << 5)	/* terminal is fact */
#define OtherBit	(1 << 6)	/* terminal is other */
#define TermBit		(1 << 7)	/* terminal is grammar lexeme */
#define LexBit		(1 << 8)	/* terminal is lexicon lexeme */
#define MatchBit	(1 << 9)	/* terminal is regexp match */
#define SkipBit		(1 << 10)	/* terminal is regexp skip */

/* Outgoing transitions and administration */
#define TransPartsBit	(1 << 11)	/* has transition to next part */
#define TransTokenBit	(1 << 12)	/* has transition to next token */
#define EosBit		(1 << 13)	/* EOS */
#define TxtFreeBit	(1 << 14)	/* transition -> text is malloced */
#define TokenStartBit	(1 << 15)	/* =0 at start of 2nd and following parts */
#define TokenPartBit	(1 << 16)	/* terminal is (live) part of token */

/* Caution: Final and Nonfinal may occur together */
#define FinalPartBit	(1 << 17)	/* part has transition to next token */
#define NonfinalPartBit	(1 << 18)	/* part has transition to next part */

/*----------------------------------------------------------------------------
// A Transition contains the info associated with a terminal starting at a
// certain character position.
//
// The next field points to the next possible Transition at the same position,
// the trans_dest_state field indicates the StateNode at the first terminal
// position after the token.
//--------------------------------------------------------------------------*/
struct transition_rec
{ TransitionType	type;			/* Flag bits (PrefixBit etc.) */
  Terminal		terminal;		/* Bitwise encoding */
  char *		text;			/* malloced if TxtFreeBit is set in type */
  int			len;			/* presumably the length of text; confirm */
  Value *		params;
  Penalty		penalty;
  StateNode *		trans_dest_state;	/* State after the token */
  Transition *		next;			/* Next Transition at this position */
};

/*--------------------------------------------------------------------------
// A StateNode contains info associated with a certain position in the text.
// The nr of negative memos allocated with this node is given by agfl-coder.
// The nr of positive memos is equal to the number of syntax rules
//
// All four pointer members are arrays; this header does not show where
// they are allocated or released.
//------------------------------------------------------------------------*/
struct state_node_rec
{ Transition** trans_lists;	/* The array of transition lists */
  NegMemo* neg_memos;		/* nr is given by agfl-coder */
  PosMemo* pos_memos;		/* Positive memo array (per rule) */
  char *lrec_markers;		/* Left recursion marker array (per rule) */

  Position pos;			/* offset in chars relative to starting position */
  int linenr;			/* original line */
  int colnr;			/* original column */
};

/* A StateIndicator is simply a pointer to a StateNode */
typedef StateNode *StateIndicator;

/*---------------------------------------------------------------------------
// Note that we assume an int to have at least 32 bits
// in the coding of Transition.terminal:
//
// for nonterminals and facts: 4 bits type, 20 bits number, 8 bits arity
// for terminals, regexp_match, regexp_skip and other:
//				4 bits type, 28 bits number
// typebits 0000 nonterminal
//	    0001 terminal
//	    0010 regexp_match
//	    0011 regexp_skip
//	    0100 other
//	    0101 fact
//
// TERM_IS_xxx (term)	tests the 4 type bits of an encoded terminal
// DECODE_xxx (term)	extracts one field of an encoded terminal
// ENCODE_xxx (...)	builds an encoded terminal; the arguments are not
//			range checked, so a number or arity that does not fit
//			its field spills into the neighbouring bits
//
// Every macro argument is parenthesized in the expansion, so that an
// argument expression with low precedence operators (e.g. c ? 1 : 2)
// is encoded as a whole.
//
// DECODE_TERM_OR_RE_CLASS reads arts_ifd.nr_lexicon_nonterminals and can
// therefore only be used where arts_ifd is declared.
//-------------------------------------------------------------------------*/
#define DECODE_TERM_TYPE(term)		(((term) >> 28) & 0xF)
#define TERM_IS_NONT(term)		(((term) & 0xF0000000) == 0x00000000)
#define DECODE_NONT_NUMBER(nont)	(((nont) >> 8) & 0xFFFFF)
#define DECODE_NONT_ARITY(nont)		((nont) & 0xFF)
#define TERM_IS_TERM(term)		(((term) & 0xF0000000) == 0x10000000)
#define DECODE_TERM_NUMBER(term)	((term) & 0x0FFFFFFF)
#define TERM_IS_MATCH(term)		(((term) & 0xF0000000) == 0x20000000)
#define TERM_IS_SKIP(term)		(((term) & 0xF0000000) == 0x30000000)
#define TERM_IS_OTHER(term)		(((term) & 0xF0000000) == 0x40000000)
#define TERM_IS_FACT(term)		(((term) & 0xF0000000) == 0x50000000)
#define DECODE_REGEXP_NUMBER(term)	((term) & 0x0FFFFFFF)
#define ENCODE_NONT(nr,arity)		(((nr) << 8) | (arity))
#define ENCODE_TERM(nr)			((nr) | 0x10000000)
#define ENCODE_MATCH(nr)		((nr) | 0x20000000)
#define ENCODE_SKIP(nr)			((nr) | 0x30000000)
#define ENCODE_OTHER(nr)		((nr) | 0x40000000)
#define ENCODE_FACT(nr,arity)		(((nr) << 8) | (arity) | 0x50000000)
#define DECODE_TERM_OR_RE_CLASS(term)	((((term) >> 28) & 0xf) - 1 + arts_ifd.nr_lexicon_nonterminals)

/*----------------------------------------------------------------------------
// A trellis contains an array of pointers to statenodes. At each position, we
// have a possibly empty list of states, representing the tokens that have
// been matched at that position (if any).
//
// The trailing comments below record which members are allocated
// separately; who releases them is not visible in this header.
//--------------------------------------------------------------------------*/
typedef struct
{ int length;			/* Paragraph span of trellis i.e. largest position */
  StateNode	*first_state;	/* Starting state of trellis */
  StateNode	**states_row;	/* allocated separately */
  StateNode	**pstates_row;	/* allocated separately, for parts */
  Position	*last_part_end_from;	/* NOTE(review): meaning not visible here */
  Transition	*trellis_facts;	/* allocated by fact_matches */
} Trellis;

/*------------------------------------------------------------------------------
// A machine word of the abstract machine should be large enough to hold
// an address. For architectures with 64 bit addresses, ints loaded from the
// object code are therefore sign extended to 64 bit ints
//
// On architecture with 32 bit addresses, machine words are only 32 bit large
//
// CODE, DATA and cel are three names for the same union; each member below
// is one possible view of a single machine word.
//----------------------------------------------------------------------------*/
typedef union Cel CODE;
typedef union Cel DATA;
typedef union Cel
{   void        *action;
    char        *str;
    ARG         arg;
    Value       val;
    CODE        *code;		/* pointer to another cel, typed as CODE */
    DATA        *data;		/* pointer to another cel, typed as DATA */
    StateIndicator input_state;  /* these 3 not in generated code */
    Transition  *input_transition;
    PosMemo     pmprod;
    Penalty     penalty;
    long	lval;
    int         ival;
    int		*iptr;
} cel;

/*------------------------------------------------------------------------------
// The central datastructure of the abstract machine
//
// It groups the file names, the three segments (code, data and rotext) with
// their sizes, the lexicon, the counts from the interface section, the
// options and the pointers to the tables. This header declares one global
// instance of it, arts_ifd.
//
// NOTE(review): size_t is used below while <limits.h> is the only standard
// header included directly; it is presumably provided through one of the
// project headers -- confirm.
//----------------------------------------------------------------------------*/
struct interface_rec
{ /* File names */
  char *	grammar_name;
  char *	input_fname;
  char *	output_fname;
  char *	input_pos_fname;

  /* The size of the code, data and rotext segment: make these signed? */
  size_t	code_size;
  size_t	data_size;
  size_t	rotext_size;

  /* The three segments */
  CODE *	code;
  DATA *	data;
  char *	rotext;

  /* The Lexicon */
  Lexicon	lexicon;

  /* Interface section */
  int		gra_version;
  int		nr_lexicon_nonterminals;
  int		nr_terminals;
  int		nr_match_regexps;
  int		nr_skip_regexps;
  int		nr_syntax_nonterminals;
  int		nr_neg_memos;
  int		nr_choices;
  int		nr_positions;
  int		eos_terminal;
  int		lexicon_used;

  /* Options set by compiler */
  int		neg_memo_option;
  int		directors_option;

  /* Options set by compiler, (should be) overrulable by command line options */
  int		counters_option;	/* Should become cumulative */
  int		profile_option;
  int		trace_option;
  int		generate_option;
  int		segment_mode;
  char *	alphabet_fname;

  /* Auxiliary fields for lexer */
  char *	word_terminator_chars;
  char *	white_space_chars;
  char *	translate_src;
  char *	translate_dst;
  int *		translate_penalties;

  /* Options not yet set by compiler, but deemed desirable */
  int		hybrid_parsing_option;
  int		max_parses;

  /* Other runtime options, only set by command line */
  int		parsing_stats_option;
  int		research_stats_option;
  int		lexer_stats_option;
  int		total_stats_option;
  int		best_parsings_option;
  int		lcsdoc_sync_option;
  int		transduce_option;
  int		label_bracket;
  int		graph_option;
  int		no_output;
  int		max_parsetime;
  int		max_edit_distance;

  /* Interface to tables */
  long *	affix_domains;
  long **	affix_weights;
  long **	nont_domains;
  long **	term_memo_dir;
  long **	match_memo_dir;
  long **	skip_memo_dir;
  long **	lex_memo_dir;
  long *	lrec_rules_table;
  long *	lex_nont_nrs_table;
  char **	affix_names;
  char **	term_names;
  char **	nonterm_names;
  char **	match_regexp_names;
  char **	skip_regexp_names;
  long *	alternatives_profile_table;
  long **	term_posmemo_dir;
  long **	match_posmemo_dir;
  long **	skip_posmemo_dir;
  long **	lex_posmemo_dir;
  long **	other_posmemo_dir;
};

/* The global interface record of the runtime system; it is only declared
   here and must be defined in exactly one source file. */
extern struct interface_rec arts_ifd;

/* Declared with a (void) prototype rather than an empty parameter list, so
   that the compiler rejects calls that pass arguments.
   NOTE(review): the definition is not visible from this header; it
   presumably initialises the data structures declared above -- confirm. */
void arts_init_ds (void);

#endif /* IncArtsDs */
