/*
   File: datfct_parser.c
   Parses dat and fact files

   Copyright 2009-2010 Radboud University of Nijmegen

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id$"
*/

/* system includes */
#include <stdio.h>
#include <string.h>
#include <stdarg.h>

/* libabase includes */
#include <abase_repr.h>
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_fileutil.h>

/* liblexicon includes */
#include <lxcn_input.h>
#include <lxcn_vocabulary.h>

/* local includes */
#include "options.h"
#include "globals.h"
#include "lexgen_lexer.h"
#include "dyn_array.h"
#include "lif_parser.h"
#include "datfct_parser.h"
#include "affix_values.h"
#include "nonterminals.h"
#include "entries.h"

/*
   A dat line has the form
   "WORDFORM"	nonterminal name [(PARAMS)] [NUMBER]

   A fact line has the form
   "WORDFORM"	nonterminal name [(PARAMS)] [NUMBER]
   where the WORDFORM will be ignored

   A triple has the form
   "WORDFORM"	RELATION "WORDFORM"
*/

/*
   Basic LL(1) parsing of the dat file

   Parameter recognition
   Actual int or text affixes of a call must be saved locally until it
   is clear whether they are critical or non critical parameters.
   So a limit is set to a maximum of 20 actual positions
   Set affixes are always directly registered through their index.

   We also store the formal typing, the affix indexes, and possible unions into
   three static separate arrays to avoid reallocating over and over again.
   The code for the punions array is quite dirty....

   Following the identification, non critical affixes are entered into
   their appropriate affix table. Critical affixes will be stored into
   their appropriate primary lookup tables.
*/
/* Number of actual parameter positions for which the arrays below have room */
#define MAX_PARAMS 20
/* NOTE(review): DEFAULT_SIZE and DEFAULT_MAX_MARKERS are not referenced in the
   code visible here; confirm whether they are still needed */
#define DEFAULT_SIZE 256
#define DEFAULT_MAX_MARKERS 8
/* Argument handed to init_int_array () for every per-position union array */
#define DEFAULT_MAX_UNION 16
/* Per position: FormalINT, FormalTEXT, FormalSET or FormalUNION as stored by
   is_parameter; type_check_union_and_unify later rewrites the entry of a union */
static int param_types[MAX_PARAMS];
/* Per position: the set affix index stored by is_parameter; the type check
   overwrites it for int, text and union parameters */
static int actual_idxs[MAX_PARAMS];

/* Per position: value of a recognized number parameter */
static int act_int_params[MAX_PARAMS];
/* Per position: buffer handed to is_string_with_expansion () for a string parameter */
static char act_text_params[MAX_PARAMS][MAX_LEX_LINE_LEN + 1];
/* Per position: int_array in which the members of a union parameter are collected */
static int_array actual_unions[MAX_PARAMS];

/*
   Give every actual position its own union array, obtained from
   init_int_array (DEFAULT_MAX_UNION). These arrays are reused for
   every parsed line.
*/
static void init_datfct_parser ()
{ int pos = 0;
  while (pos < MAX_PARAMS)
    { actual_unions[pos] = init_int_array (DEFAULT_MAX_UNION);
      pos++;
    };
}

/*
   Integral affix recognition
   Tries to read a signed number. On success the value is saved in
   act_int_params[idx] and 1 is returned; otherwise 0 is returned and
   act_int_params is left untouched.
*/
static int is_number_parameter (int idx)
{ int number;
  if (is_signed_number (&number))
    { act_int_params[idx] = number;
      return (1);
    };
  return (0);
}

/*
   Text affix recognition
   Lets is_string_with_expansion fill the text buffer reserved for
   position idx and passes its result on to the caller.
*/
static int is_string_parameter (int idx)
{ char *text_slot = act_text_params[idx];
  return (is_string_with_expansion (text_slot));
}

/*
   Recognition of set affixes
   For the reading of affix names, we accept letters, digits and ornaments

   Returns 0 when no name could be read. Otherwise returns 1 with *index
   set to the result of lookup_set_affix, or to FormalERROR (after
   reporting an error) when the lookup yields a negative value.
*/
static int is_affix_name (int *index)
{ char name_buf[MAX_LEX_LINE_LEN+1];
  int found;
  if (!is_name (name_buf)) return (0);
  may_skip_white_space ();
  found = lookup_set_affix (name_buf);
  if (found < 0)
    { parse_error ("Unknown set affix '%s'", name_buf);
      found = FormalERROR;
    };
  *index = found;
  return (1);
}

/*
   Mandatory variant of is_affix_name: when no affix name is present,
   report the error, call skip_eoln and deliver FormalERROR in *index.
*/
static void should_be_affix_name (int *index)
{ if (!is_affix_name (index))
    { parse_error ("affix name expected");
      skip_eoln ();
      *index = FormalERROR;
    };
}

/*
   Recognize one actual parameter for position idx.
   The alternatives are tried in this order: number, string, set affix
   name, union of set affix names separated by '|'. The recognized kind
   is stored in param_types[idx]; returns 1 on success and 0 when none
   of the alternatives applies.
*/
static int is_parameter (int idx)
{ int_array members;
  int affix_idx;

  if (is_number_parameter (idx))
    { param_types[idx] = FormalINT;
      return (1);
    };
  if (is_string_parameter (idx))
    { param_types[idx] = FormalTEXT;
      return (1);
    };
  if (!is_affix_name (&affix_idx)) return (0);

  /* A single affix name: remember its index directly */
  if (!is_char ('|'))
    { param_types[idx] = FormalSET;
      actual_idxs[idx] = affix_idx;
      return (1);
    };

  /* A '|' was seen: restart the union array of this position with the first affix */
  param_types[idx] = FormalUNION;
  members = actual_unions[idx];
  members -> size = 0;
  app_int_array (members, affix_idx);

  /* At least one more affix name must follow; each one is added with
     app_sorted_int_array (intended to keep the indices unique) */
  do
    { should_be_affix_name (&affix_idx);
      app_sorted_int_array (members, affix_idx);
    }
  while (is_char ('|'));
  return (1);
}
 
/*
   Mandatory variant of is_parameter: when no parameter is present,
   report the error and call skip_eoln.
*/
static void should_be_parameter (int idx)
{ if (!is_parameter (idx))
    { parse_error ("parameter expected");
      skip_eoln ();
    };
}

/*
   Recognize an optional parameter pack: '(' parameter { ',' parameter } ')'.
   Returns 0 with *nr_formals = 0 when no '(' is present; otherwise
   returns 1 with *nr_formals set to the number of parameter positions
   that were parsed.

   The per-position stores only have room for MAX_PARAMS parameters.
   When a further parameter is announced by a ',', an error is reported
   and skip_eoln is called instead of parsing it, so no position beyond
   MAX_PARAMS - 1 is ever written.
*/
static int is_parameter_pack (int *nr_formals)
{ int idx = 0;
  if (!is_char ('('))
    { *nr_formals = idx;
      return (0);
    };

  should_be_parameter (idx++);
  while (is_char (','))
    { if (idx >= MAX_PARAMS)
        { /* Parsing on would index the static arrays out of bounds */
          parse_error ("Too many actual affix positions");
          skip_eoln ();
          *nr_formals = idx;
          return (1);
        };
      should_be_parameter (idx++);
    };
  should_be_char (')');

  /* Register parameter pack */
  *nr_formals = idx;
  return (1);
}

/*
   Optional parameter pack: the outcome of is_parameter_pack is
   deliberately ignored, *nr_of_formals is set in both cases.
*/
static void may_be_parameter_pack (int *nr_of_formals)
{ int present = is_parameter_pack (nr_of_formals);
  (void) present;
}

/*
   Mandatory parameter pack: when is_parameter_pack finds none,
   report the error and call skip_eoln.
*/
static void should_be_parameter_pack (int *nr_of_formals)
{ if (!is_parameter_pack (nr_of_formals))
    { parse_error ("parameter pack expected");
      skip_eoln ();
    };
}

/*
   After the nonterminal has been identified by name and nr of formals,
   the actual parameter pack is checked against the formal pack. The
   second task of the type check is to enter the int, string and union
   affixes into their proper critical/noncritical storage while registering
   their indexes.
*/
/*
   Type check the union collected for actual position ix against the
   formal affix 'formal' and reduce it to one affix index.

   - A union holding a single affix is treated as that affix: a warning
     is given and the position becomes a FormalSET with that index.
   - Otherwise every member is checked with affix_belongs_to_domain.
     If any member fails, the position is marked FormalERROR and no
     union affix is registered. If all members pass, the union is
     registered through register_new_union_affix and the position
     becomes a FormalSET holding the returned index.
*/
static void type_check_union_and_unify (int ix, int formal)
{ int_array act_union = actual_unions[ix];
  int part_error = 0;
  int aff_nr;
  int iy;

  /* Check if we have only one affix to unify */
  if (act_union -> size == 1)
    { aff_nr = act_union -> array[0];
      if (!affix_belongs_to_domain (aff_nr, formal))
	parse_error ("Type mismatch for parameter %d: union part %s against formal type %s",
		     ix + 1, affix_name_from_index (aff_nr), affix_name_from_index (formal));

      /* We only have a single affix in the union */
      parse_warning ("Union affix resolved into single affix %s", affix_name_from_index (aff_nr));
      param_types[ix] = FormalSET;
      actual_idxs[ix] = aff_nr;
      return;
    };

  /* Type check every member of the union against the formal type */
  for (iy = 0; iy < act_union -> size; iy++)
    { aff_nr = act_union -> array[iy];
      if (!affix_belongs_to_domain (aff_nr, formal))
	{ parse_error ("Type mismatch for parameter %d: union part %s against formal type %s",
		       ix + 1, affix_name_from_index (aff_nr), affix_name_from_index (formal));
	  part_error = 1;
	};
    };

  /* Someone in error: mark the position and do not register the union.
     Without this return the error marks were overwritten right away. */
  if (part_error)
    { actual_idxs[ix] = FormalERROR;
      param_types[ix] = FormalERROR;
      return;
    };

  /* Turn it into a single affix */
  actual_idxs[ix] = register_new_union_affix (act_union, formal);
  param_types[ix] = FormalSET;
}

/*
   Type check the actual parameter at position ix against its formal.
   'critical' tells whether the position is a critical one.

   int and text parameters: on a mismatch an error is reported and
   actual_idxs[ix] becomes FormalERROR. Otherwise actual_idxs[ix]
   receives the value itself (critical int), the result of
   register_critical_text (critical text), or the result of
   register_new_int_affix / register_new_text_affix (non critical).
   set parameters: only checked, actual_idxs[ix] is left as parsed.
   union parameters: handed to type_check_union_and_unify unless the
   formal is negative.
   Any other parameter kind is ignored.
*/
static void type_check_parameter (int ix, int formal, int critical)
{ int kind = param_types[ix];

  if (kind == FormalINT)
    { if (formal == FormalINT)
	{ /* A critical int affix is entered as itself */
	  if (critical) actual_idxs[ix] = act_int_params[ix];
	  else actual_idxs[ix] = register_new_int_affix (act_int_params[ix]);
	}
      else
	{ parse_error ("Type mismatch for parameter %d: %s against formal type %s",
		       ix + 1, affix_name_from_index (kind), affix_name_from_index (formal));
	  actual_idxs[ix] = FormalERROR;
	};
      return;
    };

  if (kind == FormalTEXT)
    { if (formal == FormalTEXT)
	{ /* A critical text affix goes into the trie of all critical text affixes */
	  if (critical) actual_idxs[ix] = register_critical_text (act_text_params[ix]);
	  else actual_idxs[ix] = register_new_text_affix (act_text_params[ix]);
	}
      else
	{ parse_error ("Type mismatch for parameter %d: %s against formal type %s",
		       ix + 1, affix_name_from_index (kind), affix_name_from_index (formal));
	  actual_idxs[ix] = FormalERROR;
	};
      return;
    };

  if (kind == FormalSET)
    { int aff_nr = actual_idxs[ix];
      /* The domain is only consulted for a non negative formal */
      int in_domain = (formal >= 0) && affix_belongs_to_domain (aff_nr, formal);
      if (!in_domain)
	parse_error ("Type mismatch for parameter %d: %s against formal type %s",
		     ix + 1, affix_name_from_index (aff_nr), affix_name_from_index (formal));
      return;
    };

  if (kind == FormalUNION)
    { if (formal < 0)
	parse_error ("Type mismatch for parameter %d: union type against formal type %s",
		     ix + 1, affix_name_from_index (formal));
      else type_check_union_and_unify (ix, formal);
    };
}

/*
   Type check every actual position of the current call against the
   formals and critical flags of nonterminal nont_index. The number of
   positions checked is the size of the formals array.
*/
static void type_check_parameter_pack (int nont_index)
{ int_array formal_types = formals_from_nonterminal (nont_index);
  int_array critical_flags = criticals_from_nonterminal (nont_index);
  int pos = 0;
  while (pos < formal_types -> size)
    { type_check_parameter (pos, formal_types -> array[pos], critical_flags -> array[pos]);
      pos++;
    };
}

/*
 * Bonus for multi-token matches: count the runs of consecutive space
 * characters in the lexeme (leading and trailing runs included) and
 * return the square of that count; 0 when the lexeme has no space.
 */

int multi_token_bonus(char *lexeme)
{
    int runs = 0;
    int in_run = 0;
    char *p;

    for (p = lexeme; *p != 0; p++) {
        if (*p != ' ') {
            in_run = 0;
        } else if (!in_run) {
            /* first space of a new run */
            in_run = 1;
            runs++;
        }
    }

    return runs * runs;
}


/*
   Optional frequency: *freq becomes 0 when is_signed_number does not
   recognize a number.
*/
static void may_be_frequency (int *freq)
{ if (!is_signed_number (freq)) *freq = 0;
}

/*
   Recognize one dat line: word form, nonterminal name, optional
   parameter pack and optional frequency.
   Returns 0 when the line does not start with a word form, otherwise 1,
   also when an error was reported. An entry is only registered when the
   nonterminal is known and is not a fact.
*/
static int is_rule ()
{ char word_form[MAX_LEX_LINE_LEN + 1];
  char nont_name[MAX_LEX_LINE_LEN + 1];
  int marker;
  int nont_index, nr_params, freq, call_id, mt_bonus;
  int_array criticals;
  int *info_ptr;

  if (!is_word_form (word_form, &marker)) return (0);

  /* Read remainder of rule */
  should_be_nonterminal_name (nont_name);
  may_be_parameter_pack (&nr_params);
  may_be_frequency (&freq);

  /* Identify the nonterminal by name and number of parameters */
  nont_index = lookup_nonterminal (nont_name, nr_params);
  if (nont_index < 0)
    { parse_error ("No such nonterminal '%s/%d'", nont_name, nr_params);
      return (1);
    };

  /* Facts do not belong in a dat file */
  if (fact_nr_from_nonterminal (nont_index) >= 0)
    { parse_error ("Nonterminal '%s/%d' is a fact", nont_name, nr_params);
      return (1);
    };

  mt_bonus = multi_token_bonus (word_form);

  /* Type check, then register the call and enter the word form */
  type_check_parameter_pack (nont_index);
  criticals = criticals_from_nonterminal (nont_index);
  call_id = register_new_call (nont_index, criticals, actual_idxs);
  info_ptr = enter_into_lexicon (word_form, marker);
  register_new_entry (info_ptr, call_id, freq, mt_bonus);
  return (1);
}

/*
   Collect the actual indexes of all critical positions and enter them
   into the fact table of fact_nr.
   The local array passed to enter_into_fact_table has the layout
     crits[0]        number of critical positions
     crits[1..total] their actual indexes, in parameter order
   so it needs one element more than the maximal number of parameters.
   Returns whatever enter_into_fact_table returns.
   Assumes criticals -> size never exceeds MAX_PARAMS -- TODO confirm
   against the nonterminal registration.
*/
static int *collect_crits_and_enter_into_fact_table (int fact_nr, int_array criticals)
{ int crits[MAX_PARAMS + 1];	/* +1: slot 0 holds the count */
  int ix, total;
  for (ix = 0, total = 0; ix < criticals -> size; ix++)
    if (criticals -> array[ix])
      { total++;
        crits[total] = actual_idxs[ix];
      };
  crits[0] = total;	/* Note: should always be the same for one fact */

  /* Enter into right fact table, with crits and total */
  return (enter_into_fact_table (fact_nr, crits));
}

/*
   Recognize one fact line: string, nonterminal name, mandatory
   parameter pack and optional frequency. The text of the leading string
   is read but not used any further.
   Returns 0 when the line does not start with a string, otherwise 1,
   also when an error was reported. An entry is only registered when the
   nonterminal is known and is a fact.
*/
static int is_fact ()
{ char ignored_text[MAX_LEX_LINE_LEN + 1];
  char nont_name[MAX_LEX_LINE_LEN + 1];
  int nont_index, nr_params, freq, fact_nr, call_id;
  int_array criticals;
  int *info_ptr;

  if (!is_string (ignored_text)) return (0);

  /* Parse parameters and bonus */
  should_be_nonterminal_name (nont_name);
  should_be_parameter_pack (&nr_params);
  may_be_frequency (&freq);

  /* Identify the nonterminal by name and number of parameters */
  nont_index = lookup_nonterminal (nont_name, nr_params);
  if (nont_index < 0)
    { parse_error ("No such nonterminal '%s/%d'", nont_name, nr_params);
      return (1);
    };

  /* Only facts belong in a fact file */
  fact_nr = fact_nr_from_nonterminal (nont_index);
  if (fact_nr < 0)
    { parse_error ("Nonterminal '%s/%d' is not a fact", nont_name, nr_params);
      return (1);
    };

  /* Type check, then register the call and enter the critical values */
  type_check_parameter_pack (nont_index);
  criticals = criticals_from_nonterminal (nont_index);
  call_id = register_new_call (nont_index, criticals, actual_idxs);
  info_ptr = collect_crits_and_enter_into_fact_table (fact_nr, criticals);
  register_new_entry (info_ptr, call_id, freq, 0);
  return (1);
}

/*
   Drive the lexicon, fact and triple file parsing
*/
/*
   Parse the dat file lname until end of file. Per iteration: an empty
   line is read past, a comment is accepted, a rule may be followed by
   a comment and must otherwise be followed by end of line; anything
   else is reported and the line is read past.
*/
static void parse_dat_file (char *lname)
{ try_open_lexer_file (lname, lexicon);
  while (!is_eof ())
    { /* Every path below should eat the line */
      may_skip_white_space ();
      if (is_eoln ())
        { lexer_read_line ();
          continue;
        };
      if (is_comment ()) continue;
      if (!is_rule ())
        { parse_error ("incomprehensible syntax");
          lexer_read_line ();
          continue;
        };

      /* The rule may still be followed by a comment */
      if (!is_comment ()) should_be_eoln ();
    };
  close_lexer_file ();
}

/*
   Parse the fact file fname until end of file. Per iteration: an empty
   line is read past, a comment is accepted, a fact may be followed by
   a comment and must otherwise be followed by end of line; anything
   else is reported and the line is read past.
*/
static void parse_fct_file (char *fname)
{ try_open_lexer_file (fname, fact);
  while (!is_eof ())
    { /* Every path below should eat the line */
      may_skip_white_space ();
      if (is_eoln ())
        { lexer_read_line ();
          continue;
        };
      if (is_comment ()) continue;
      if (!is_fact ())
        { parse_error ("incomprehensible syntax");
          lexer_read_line ();
          continue;
        };

      /* The fact may still be followed by a comment */
      if (!is_comment ()) should_be_eoln ();
    };
  close_lexer_file ();
}

/*
   Register a call and a lexicon entry (frequency 1, with multi token
   bonus) for every terminal whose marker is not TERMINAL_PLACEHOLDER.
   Terminal nr is registered under index nr_of_nonterminals () + nr.
   All calls share one int_array from new_int_array () as criticals
   (presumably empty -- confirm) and pass NULL as actuals.
*/
void register_terminal_calls ()
{ int first_term_idx = nr_of_nonterminals ();
  int term_count = nr_of_terminals ();
  int_array no_criticals = new_int_array ();
  int nr;
  for (nr = 0; nr < term_count; nr++)
    { char *text;
      int marker;
      int call_id, mt_bonus;
      int *info_ptr;
      get_terminal_text_and_marker (nr, &text, &marker);
      if (marker == TERMINAL_PLACEHOLDER) continue;

      call_id = register_new_call (first_term_idx + nr, no_criticals, NULL);
      mt_bonus = multi_token_bonus (text);
      info_ptr = enter_into_lexicon (text, marker);
      register_new_entry (info_ptr, call_id, 1, mt_bonus);
    };
}

/*
   Entry point: initialize the parser, parse all files named in
   lexicon_names as dat files, then all files named in fact_table_names
   as fact files, and finish with squash_unique_frequencies,
   register_terminal_calls and apply_bonuses_for_multiwords, in that order.
*/
void parse_datfct_files ()
{ int nr;
  init_datfct_parser ();

  nr = 0;
  while (nr < lexicon_names -> size)
    { parse_dat_file (lexicon_names -> array[nr]);
      nr++;
    };

  nr = 0;
  while (nr < fact_table_names -> size)
    { parse_fct_file (fact_table_names -> array[nr]);
      nr++;
    };

  squash_unique_frequencies ();
  register_terminal_calls ();
  apply_bonuses_for_multiwords ();
}
