/*
   File: parser.c
   Parses dat, fact and triple files

   Copyright 2012 Radboud University of Nijmegen

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: parser.c,v 1.12 2012/11/28 14:42:08 marcs Exp $"
*/

/* libdcg includes */
#include <dcg.h>
#include <dcg_error.h>

/* libebase includes */
#include <ebase_ds.h>
#include <ebase_input.h>
#include <ebase_affix_value_utils.h>

/* Local includes */
#include "globals.h"
#include "lexer.h"
#include "affix_values.h"
#include "calls.h"
#include "entries.h"
#include "parser.h"

/*
   Parsing of parameters. Note that we first have to collect
   the parameters as affix values with the corresponding formals
   before we can identify the nonterminal and hence decide which
   parameters are critical and which are not. This will cause
   some extra time overhead on the part of the memory management
   system (Lots of allocates and frees).
*/
static void may_be_rest_lattice_parameter (int dkind, int domain_nr, affix_value *result)
{ affix_value_list vals = new_affix_value_list ();
  int error = (dkind == TYPE_UNKNOWN);
  int kind, type_nr, elt_nr;
  char *str;
  app_affix_value_list (vals, attach_affix_value (*result));
  while (is_char ('|'))
    { if (is_affix_terminal (&str))
	{ if (!lookup_value_string (str, &kind, &type_nr, &elt_nr))
 	    { parse_error ("Could not identify affix terminal '%s'", str);
	      error = 1;
	    };
	  if (kind != TYPE_ELEMENT)
	    { app_affix_value_list (vals, new_Null_value (-1));
	      error = 1;
	    }
	  else 
	    { affix_value my_value = attach_affix_value (all_rt_domains -> array[type_nr] ->
						         elts -> array[elt_nr] -> value);
	      app_affix_value_list (vals, my_value);
	      if ((dkind != TYPE_UNKNOWN) && (type_nr != domain_nr))
		{ parse_error ("Type mismatch in union of lattice values");
		  error = 1;
		};
	    };
	  detach_string (&str);
	}
      else if (is_affix_nonterminal (&str))
	{ if (!lookup_value_string (str, &kind, &type_nr, &elt_nr))
 	    { parse_error ("Could not identify affix nonterminal '%s'", str);
	      error = 1;
	    };
	  if (kind != TYPE_AFFIX_NONTERMINAL)
	    { app_affix_value_list (vals, new_Null_value (-1));
	      error = 1;
	    }
	  else
	    { rt_type rt= all_rt_types -> array[type_nr];
	      affix_value my_value;
	      if (rt -> tag != TAGLattice_type)
	        dcg_bad_tag (rt -> tag, "is_parameter");
	      my_value = attach_affix_value (rt -> value);
	      app_affix_value_list (vals, my_value);
	      if ((dkind != TYPE_UNKNOWN) && (rt -> Lattice_type.dom != domain_nr))
		{ parse_error ("Type mismatch in union of lattice values");
		  error = 1;
		};
	    };
	  detach_string (&str);
	}
      else
	{ parse_error ("affix terminal or affix nonterminal expected");
	  error = 1;
	};
    };

  /* Unify the values, if there was no error */
  if (error) ;
  else if (vals -> size > 1)
    { detach_affix_value (result);
      *result = ebs_join_lattice_values (vals);
    };
  detach_affix_value_list (&vals);
}

/*
   Formal parameters either equal the runtime type nr, or
   -2 - the runtime domain nr, or -1: error.
*/
static int is_parameter (int_list rname_chars, int_list formals, affix_value_list params)
{ int kind, type_nr, elt_nr;
  real rvalue;
  char *str;
  int value;
  if (is_string_with_expansion (&str))
    { /* TEXT parameter */
      affix_value text_value = new_Text_value (-1, str);
      app_int_list (rname_chars, 0);
      app_int_list (formals, text_formal ());
      app_affix_value_list (params, text_value);
      return (1);
    }
  else if (is_real_number (&rvalue))
    { /* REAL parameter */
      affix_value real_value = new_Real_value (-1, rvalue);
      app_int_list (rname_chars, 0);
      app_int_list (formals, real_formal ());
      app_affix_value_list (params, real_value);
      return (1);
    }
  else if (is_signed_number (&value))
    { /* INT parameter */
      affix_value int_value = new_Int_value (-1, value);
      app_int_list (rname_chars, 0);
      app_int_list (formals, int_formal ());
      app_affix_value_list (params, int_value);
      return (1);
    }
  else if (is_affix_terminal (&str))
    { /*
	 From a single terminal, we cannot deduce whether we are
	 dealing with a single lattice element or multiple tree markers
      */
      string_list terms = new_string_list ();
      affix_value result_value;
      app_string_list (terms, str);
      while (is_affix_terminal (&str))
	app_string_list (terms, str);
      app_int_list (rname_chars, 0);

      /* Identify the bunch */
      if (terms -> size != 1)
	{ if (!lookup_value_string_list (terms, &kind, &type_nr, &elt_nr))
	    parse_error ("Could not identify list of markers '%s...'",
			 terms -> array[0]);
        }
      else if (!lookup_value_string (terms -> array[0], &kind, &type_nr, &elt_nr))
 	parse_error ("Could not identify affix terminal '%s'", terms -> array[0]);
      if (kind == TYPE_MARKER)
	{ /* Enumerable types are stored as ints */
	  app_int_list (formals, type_nr);
	  result_value = new_Int_value (type_nr, elt_nr);
	}
      else if (kind == TYPE_ELEMENT)
	{ app_int_list (formals, ENCODE_DOMAIN_FORMAL (type_nr));
	  result_value = attach_affix_value (all_rt_domains -> array[type_nr] ->
					     elts -> array[elt_nr] -> value);
	}
      else
	{ app_int_list (formals, ERROR_FORMAL);
	  result_value = new_Null_value (-1);
	};
      detach_string_list (&terms);
      may_be_rest_lattice_parameter (kind, type_nr, &result_value);
      app_affix_value_list (params, result_value);
      return (1);
    }
  else if (is_affix_nonterminal (&str))
    { affix_value lattice_value;
      int domain_nr = -1;
      app_int_list (rname_chars, 0);
      if (!lookup_value_string (str, &kind, &type_nr, &elt_nr))
	parse_error ("Could not identify affix nonterminal '%s'", str);
      if (kind == TYPE_AFFIX_NONTERMINAL)
	{ rt_type rt = all_rt_types -> array[type_nr];
	  if (rt -> tag != TAGLattice_type)
	    dcg_bad_tag (rt -> tag, "is_parameter");
	  domain_nr = rt -> Lattice_type.dom;

	  /* anonymize nonterminal value */
	  lattice_value = rdup_affix_value (rt -> value);
	  lattice_value -> rule_nr = -1;
	  app_int_list (formals, ENCODE_DOMAIN_FORMAL (domain_nr));
	}
      else
        { app_int_list (formals, ERROR_FORMAL);
	  lattice_value = new_Null_value (-1);
	};
      detach_string (&str);
      may_be_rest_lattice_parameter (kind, domain_nr, &lattice_value);
      app_affix_value_list (params, lattice_value);
      return (1);
    };
  return (0);
}

static void should_be_parameter (int_list rname_chars, int_list formals, affix_value_list params)
{ if (is_parameter (rname_chars, formals, params)) return;
  parse_error ("parameter expected");
}

/*
   Parsing of parameter packs
*/
static int is_parameters_pack (int_list rname_chars, int_list formals, affix_value_list params)
{ if (!is_char ('(')) return (0);
  should_be_parameter (rname_chars, formals, params);
  while (is_char (','))
    should_be_parameter (rname_chars, formals, params);
  should_be_char (')');
  return (1);
}

/*
   Type checking of calls. Note that we only have to check lattice types
   For all other types, the identification of the nonterminal is enough
*/
static void typecheck_actual_value (int nr, int formal, affix_value param)
{ rt_type rt = all_rt_types -> array[formal];
  if (rt -> tag == TAGSynonym_type)
    rt = all_rt_types -> array[rt -> Synonym_type.snr];
  if ((rt -> tag == TAGLattice_type) &&
      !ebs_lattice_value_is_subset (param, rt -> value))
    parse_error ("Parameter %d is not contained in affix nonterminal %s", rt -> name);
}

static void typecheck_actual_values (int_list formals, affix_value_list params)
{ int ix;
  if (formals -> size != params -> size)
    dcg_internal_error ("typecheck_actual_values");
  for (ix = 0; ix < formals -> size; ix++)
    typecheck_actual_value (ix + 1, formals -> array[ix], params -> array[ix]);
}

/*
   Collect all actual parameters
*/
static void collect_actual_parameters (int_list actuals, int_list cr_vec, affix_value_list params)
{ int ix;
  if (params -> size != cr_vec -> size)
    dcg_internal_error ("collect_actual_parameters");
  for (ix = 0; ix < params -> size; ix++)
    if (!cr_vec -> array[ix])
      app_int_list (actuals, collect_actual_parameter (params -> array[ix]));
}

static void collect_critical_parameters (int_list crits, int_list cr_vec, affix_value_list params)
{ int ix;
  if (params -> size != cr_vec -> size)
    dcg_internal_error ("collect_critical_parameters");
  for (ix = 0; ix < params -> size; ix++)
    if (cr_vec -> array[ix])
      { affix_value param = params -> array[ix];
	if (param -> tag != TAGText_value)
	  dcg_bad_tag (param -> tag, "collect_critical_parameters");
	app_int_list (crits, collect_critical_parameter (param -> Text_value.text));
      };
}

/*
   A dat line has the form
   "WORDFORM"   name part [(PARAMS)|name parts] [NUMBER]
*/
static int is_rule ()
{ int marker, frequency, nont_id;
  char *wform, *name_part;
  string_list rname_parts;
  int_list rname_chars;
  affix_value_list params;
  int_list formals;

  /* Parse the word form */
  if (!is_word_form (&wform, &marker, 0)) return (0);

  /* Collect the call */
  rname_parts = new_string_list ();
  rname_chars = new_int_list ();
  params = new_affix_value_list ();
  formals = new_int_list ();
  should_be_name_part (&name_part);
  app_string_list (rname_parts, name_part);
  app_int_list (rname_chars, 1);
  while (1)
    { if (is_name_part (&name_part))
	{ app_string_list (rname_parts, name_part);
	  app_int_list (rname_chars, 1);
	}
      else if (is_parameters_pack (rname_chars, formals, params)) ;
      else break;
    };
  if (!is_signed_number (&frequency)) frequency = 1;	/* Default */

  /* Process the call */
  nont_id = try_identify_nonterminal (rname_parts, formals);
  if (nont_id < 0)
    parse_error ("Could not identify call");
  else
    { lex_nont nont = all_lex_nonts -> array[nont_id];
      typecheck_actual_values (nont -> formals, params);
      if (nont -> fact_nr != -1)
	parse_error ("This nonterminals is a fact");
      else
	{ int_list actuals = init_int_list (formals -> size);
	  int call_id, *info_ptr;
	  if (!equal_int_list (rname_chars, nont -> name_chars))
	    parse_warning ("This nonterminal is usually written in another way");
	  collect_actual_parameters (actuals, nont -> crits, params);
	  call_id = enter_call (nont_id, actuals);
          info_ptr = enter_into_lexicon (wform, marker);
	  register_new_entry (info_ptr, call_id, frequency);
	  detach_int_list (&actuals);
	};
    }

  /* Line has been parsed */
  detach_string (&wform);
  detach_string_list (&rname_parts);
  detach_int_list (&rname_chars);
  detach_affix_value_list (&params);
  detach_int_list (&formals);
  return (1);
}

/*
   A fact line has the form
   "WORDFORM"   name part [(PARAMS)|name parts] [NUMBER] where the WORDFORM will be ignored
*/
static int is_fact ()
{ int marker, frequency, nont_id;
  char *wform, *name_part;
  string_list rname_parts;
  int_list rname_chars;
  affix_value_list params;
  int_list formals;

  /* Parse the word form and ignore it */
  if (!is_word_form (&wform, &marker, 1)) return (0);

  /* Collect the call */
  rname_parts = new_string_list ();
  rname_chars = new_int_list ();
  params = new_affix_value_list ();
  formals = new_int_list ();
  should_be_name_part (&name_part);
  app_string_list (rname_parts, name_part);
  app_int_list (rname_chars, 1);
  while (1)
    { if (is_name_part (&name_part))
	{ app_string_list (rname_parts, name_part);
	  app_int_list (rname_chars, 1);
	}
      else if (is_parameters_pack (rname_chars, formals, params)) ;
      else break;
    };
  if (!is_signed_number (&frequency)) frequency = 1;		/* Default for facts */

  /* Process the call */
  nont_id = try_identify_nonterminal (rname_parts, formals);
  if (nont_id < 0)
    parse_error ("Could not identify call");
  else
    { lex_nont nont = all_lex_nonts -> array[nont_id];
      typecheck_actual_values (nont -> formals, params);
      if (nont -> fact_nr == -1)
	parse_error ("This nonterminal is not a fact");
      else
	{ int_list actuals = init_int_list (formals -> size);
  	  int_list crits = init_int_list (nont -> ncrits);
	  int call_id, *info_ptr;
	  if (!equal_int_list (rname_chars, nont -> name_chars))
	    parse_warning ("This fact is usually written in another way");
	  collect_actual_parameters (actuals, nont -> crits, params);
	  collect_critical_parameters (crits, nont -> crits, params);
	  call_id = enter_call (nont_id, actuals);
	  info_ptr = enter_into_fact_table (nont -> fact_nr, crits);
	  register_new_entry (info_ptr, call_id, frequency);
	  detach_int_list (&actuals);
	  detach_int_list (&crits);
	};
    }

  /* Line has been parsed */
  detach_string (&wform);
  detach_string_list (&rname_parts);
  detach_int_list (&rname_chars);
  detach_affix_value_list (&params);
  detach_int_list (&formals);
  return (1);
}

/*
   A triple has the form
   "WORDFORM" RELATION "WORDFORM"
*/
static int is_triple ()
{ char *left, *rel, *right;
  int frequency;
  int *info_ptr;
  int_list crits;
  if (!is_string_with_expansion (&left)) return (0);
  should_be_relation (&rel);
  should_be_string_with_expansion (&right);
  if (!is_signed_number (&frequency)) frequency = 1;	/* Default for triples? */

  /* Enter the three parts as critical parameters of an anonymous extra fact */
  app_int_list (crits, collect_critical_parameter (left));
  app_int_list (crits, collect_critical_parameter (rel));
  app_int_list (crits, collect_critical_parameter (right));
  info_ptr = enter_into_fact_table (nr_facts, crits);
  *info_ptr += frequency;
  detach_int_list (&crits);
  detach_string (&left);
  detach_string (&rel);
  detach_string (&right);
  return (1);
}

/*
   A translation has the form
   srcletter	trgtletter	[penalty]

   where srcletter and trgtletter are either an unsigned number
   or a quoted character in which we handle the usual escapes.
   The optional penalty should be an unsigned number.
   The numbers will be checked against the current encoding
   (UTF8 processing: < 0x200000, !UTF8 processing: < 256)
*/
static int is_trans_letter (int *ret_nr)
{ if (is_unsigned_number (ret_nr)) return (1);
  return (is_quoted_character (ret_nr));
}

static int is_translation ()
{ int trans_src, trans_dest, trans_penalty;
  if (!is_trans_letter (&trans_src)) return (0);
  if (is_trans_letter (&trans_dest))
    { /* Process translation */
      if (!is_signed_number (&trans_penalty)) trans_penalty = 1;	/* Default */
      app_int_list (translation_sources, trans_src);
      app_int_list (translation_targets, trans_dest);
      app_int_list (translation_penalties, trans_penalty);
    }
  else
    parse_error ("translation target expected");
  return (1);
}

/*
   Drive the lexicon, fact and triple file parsing
*/
static void parse_dat_file (char *lname)
{ try_open_lexer_file (lname, lexicon);
  while (!is_eof ())
    { /* Body should eat line */
      may_skip_white_space ();
      if (is_eoln ()) lexer_read_line ();
      else if (is_comment ()) ;
      else if (is_rule ())
        { /* May still be followed by a comment */
          if (is_comment ()) ;
          else should_be_eoln ();
        }
      else
        { parse_error ("incomprehensible syntax");
          lexer_read_line ();
        };
    }
  close_lexer_file ();
}

static void parse_fct_file (char *fname)
{ try_open_lexer_file (fname, fact);
  while (!is_eof ())
    { /* Body should eat line */
      may_skip_white_space ();
      if (is_eoln ()) lexer_read_line ();
      else if (is_comment ()) ;
      else if (is_fact ())
        { /* May still be followed by a comment */
          if (is_comment ()) ;
          else should_be_eoln ();
        }
      else
        { parse_error ("incomprehensible syntax");
          lexer_read_line ();
        };
    }
  close_lexer_file ();
}

static void parse_trp_file (char *tname)
{ try_open_lexer_file (tname, triple);
  while (!is_eof ())
    { /* Body should eat line */
      may_skip_white_space ();
      if (is_eoln ()) lexer_read_line ();
      else if (is_comment ()) ;
      else if (is_triple ())
        { /* May still be followed by a comment */
          if (is_comment ()) ;
          else should_be_eoln ();
        }
      else
        { parse_error ("incomprehensible syntax");
          lexer_read_line ();
        };
    }
  close_lexer_file ();
}

/*
   A terminal is registered as a nonterminal with an index equal
   to the terminal number + the number of lex_nonts read from 
   the EAG3 compiler. Note that terminals that only differ in
   their marker are still registered with different terminal nr
   and hence different calls, entry lists, etc.
*/
static void register_terminal_entries ()
{ int_list null_actuals = new_int_list ();
  int nont_idx = all_lex_nonts -> size;
  int ix;
  for (ix = 0; ix < all_terminals -> size; ix++)
    { terminal term = all_terminals -> array[ix];
      int call_id = enter_call (nont_idx + ix, null_actuals);
      char *lexeme = convert_terminal_to_lexeme (term -> text);
      int *info_ptr = enter_into_lexicon (lexeme, term -> marker);
      app_string_list (all_terminal_texts, new_string (lexeme));
      register_new_entry (info_ptr, call_id, 1);	/* Default frequency */
    };
}

/*
   Handle the translations
*/
static void make_default_translations ()
{ int ix;
  for (ix = 'A'; ix <= 'Z'; ix++)
    { int lower_ix = ix + 0x20;
      app_int_list (translation_sources, ix);
      app_int_list (translation_targets, lower_ix);
      app_int_list (translation_penalties, 1);
    };
}

static void parse_translations ()
{ try_open_lexer_file (translate_fname, translations);
  while (!is_eof ())
    { /* Body should eat line */
      may_skip_white_space ();
      if (is_eoln ()) lexer_read_line ();
      else if (is_comment ()) ;
      else if (is_translation ())
        { /* May still be followed by a comment */
          if (is_comment ()) ;
          else should_be_eoln ();
        }
      else
        { parse_error ("incomprehensible translations syntax");
          lexer_read_line ();
        };
    }
  close_lexer_file ();
}

void parse_files ()
{ int ix;
  for (ix = 0; ix < all_lexica -> size; ix++)
    parse_dat_file (all_lexica -> array[ix]);
  for (ix = 0; ix < all_fact_tables -> size; ix++)
    parse_fct_file (all_fact_tables -> array[ix]);
  for (ix = 0; ix < all_triple_databases -> size; ix++)
    parse_trp_file (all_triple_databases -> array[ix]);
  register_terminal_entries ();
  switch (translate_option)
    { case TranslationsDefault: make_default_translations (); break;
      case TranslationsByFile: parse_translations ();
      case TranslationsOff: break;
      default: dcg_bad_tag (translate_option, "parse_files");
    };
}

