/*
   File: lif_parser.c
   Parses the lexicon interface file

   Copyright 2007 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: lif_parser.c,v 1.14 2008/09/03 14:59:56 olafs Exp $"
*/

/* system includes */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

/* libabase includes */
#include <abase_repr.h>
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_fileutil.h>

/* local includes */
#include "options.h"
#include "globals.h"
#include "dyn_array.h"
#include "lif_parser.h"
#include "dat_parser.h"
#include "affix_values.h"
#include "nonterminals.h"

/*
   A lexicon interface file consists of 4 parts separated by a specific line
   containing '*****'. The 4 parts are
   1) A list of lexicon names (to be read by the dat parser)
   2) The affix table (defining all affix names)
   3) The nonterminal tables (defining all nonterminals with appropriate typing)
   4) Pragmats which influence the form or meaning of the lexicon in some way
*/

/*
   Character reading administration
*/
/* Size handed to fgets () in read_line (); the buffers are one byte larger */
#define MAX_LIF_LINE_LEN 1023
/* The line most recently read by read_line (), trailing white space removed */
static char lif_line_buffer[MAX_LIF_LINE_LEN + 1];
/* Name of the lif file, set by try_open_lexicon_interface () */
static char *lif_fname;
/* Read position in the current line; NULL after fgets () returned NULL */
static char *line_ptr;
/* Set to 1 by lif_error (), inspected by close_lexicon_interface () */
static int has_errors;
/* Incremented by every read_line (); reported by lif_error () */
static int linenr;
/* The lif file as opened by try_open_lexicon_interface () */
static FILE *lif;

/*
   Report an error in the lif file. The message is preceded by the
   file name and the current line number and followed by a newline.
   The fact that an error occurred is remembered in has_errors.

   format, ...: printf like message as accepted by abs_vprintf
*/
static void lif_error (char *format, ...)
{ va_list args;

  /* Remember that the lif file can not be trusted */
  has_errors = 1;

  /* Where did it go wrong */
  abs_printf ("File '%s', line %d: ", lif_fname, linenr);

  /* What went wrong */
  va_start (args, format);
  abs_vprintf (format, args);
  va_end (args);
  abs_printf ("\n");
}

/*
   White space is one of blank, tab, newline, form feed or carriage return.
   Returns 1 if ch is white space, 0 otherwise.
*/
static int ch_is_white_space (char ch)
{ switch (ch)
    { case ' ':
      case '\t':
      case '\n':
      case '\f':
      case '\r':
	return (1);
      default:
	return (0);
    };
}

/*
   Read the next line into lif_line_buffer and eat all trailing white space.
   Afterwards line_ptr points to the start of the line, or is NULL when
   fgets () could not deliver another line. The line number is incremented
   in both cases.

   The trailing white space is removed by index instead of by a pointer
   running backwards: for an empty or all white space line such a pointer
   would end up before the start of the buffer.
*/
static void read_line ()
{ size_t len;
  line_ptr = fgets (lif_line_buffer, MAX_LIF_LINE_LEN, lif);
  linenr++;
  if (line_ptr == NULL) return;

  /* Shorten the line until its last character is not white space */
  len = strlen (line_ptr);
  while ((len != 0) && ch_is_white_space (line_ptr[len - 1]))
    len--;
  line_ptr[len] = '\0';
}

/*
   Opening the lif file: no version control yet.
   The file is called <basename>.lif. After opening, the error and line
   administration is reset and the first line is read ahead.
*/
#define LIF_SUFFIX "lif"
static void try_open_lexicon_interface ()
{ /* Start with a clean administration */
  linenr = 0;
  has_errors = 0;

  /* Construct the file name and open the file */
  lif_fname = abs_new_fmtd_string ("try_open_lexicon_interface", "%s.%s", basename, LIF_SUFFIX);
  lif = abs_fopen (lif_fname, "r");

  /* Read ahead the first line */
  read_line ();

  /* Note: no AGFL version check is done here, we only report what we read */
  if (!verbose) return;
  abs_message ("  reading lexicon interface \"%s\"", basename);
}

/*
   Close the lif file. When errors have been reported by lif_error,
   abs_abort is called first. In verbose mode the number of affixes
   and nonterminals found is reported.
*/
static void close_lexicon_interface ()
{ if (has_errors)
    abs_abort ("close_lexicon_interface", "Lif file '%s' contains errors", lif_fname);
  fclose (lif);

  /* Only report the totals when asked for */
  if (!verbose) return;
  abs_message ("    found %d set affixes and domains, %d lexicon nonterminals",
	       nr_of_set_affixes (), nr_of_nonterminals ());
}

/* Advance line_ptr over any white space (see ch_is_white_space) */
static void may_skip_white_space ()
{ for (; ch_is_white_space (*line_ptr); line_ptr++)
    ;
}

/* Advance line_ptr over any blanks; tabs are not skipped */
static void may_skip_spaces ()
{ for (; *line_ptr == ' '; line_ptr++)
    ;
}

/* Returns 1 when read_line could not deliver another line, 0 otherwise */
static int is_eof ()
{ if (line_ptr != NULL) return (0);
  return (1);
}

/* Report an error unless the end of the lif file has been reached */
static void should_be_eof ()
{ if (!is_eof ())
    lif_error ("End of file expected");
}

/*
   Returns 1 when line_ptr is at the end of the current line.
   line_ptr is dereferenced, so it may not be called at end of file.
*/
static int is_eoln ()
{ return (line_ptr[0] == '\0');
}

/*
   Columns are separated by a single TAB. If line_ptr is at a TAB,
   it is skipped and 1 is returned, otherwise 0 is returned and
   line_ptr is left alone.
*/
static int is_column_separator ()
{ if (*line_ptr != '\t') return (0);
  line_ptr++;
  return (1);
}

/* Skip a column separator or report an error when it is missing */
static void should_be_column_separator ()
{ if (!is_column_separator ())
    lif_error ("Column separator expected");
}

/*
   Report an error when the current line has not been consumed completely.
   In both cases continue with the next line.
*/
static void should_be_eoln ()
{ int at_end = is_eoln ();
  if (!at_end) lif_error ("End of line expected");
  read_line ();
}

/*
   For the reading of names (file names or affix names),
   we accept all characters except white space. Currently
   there is no need to be more restrictive. Note: these
   names are immediately allocated

   Returns 1 and delivers a copy of the name made by abs_new_string
   in *ret_name when at least one character could be read. Returns 0
   and leaves *ret_name alone when line_ptr is at white space or at the
   end of the line. Callers in this file release the name with abs_free.

   The check for an empty name must compare the write pointer with the
   start of the local buffer: line_ptr points into the line buffer and
   can never be equal to buf.
*/
static int is_name (char **ret_name)
{ char buf[MAX_LIF_LINE_LEN+1];
  char *dptr = buf;
  while (!is_eoln () && !ch_is_white_space (*line_ptr))
    *dptr++ = *line_ptr++;
  *dptr = '\0';

  /* Nothing copied: there is no name at this position */
  if (dptr == buf) return (0);
  *ret_name = abs_new_string (buf, "is_name");
  return (1);
}

/*
   Read a name or report an error when there is none.
   In the latter case *ret_name is set to NULL.
*/
static void should_be_name (char **ret_name)
{ if (!is_name (ret_name))
    { *ret_name = NULL;
      lif_error ("Name expected");
    };
}

/*
   A nonterminal name consists of one or more parts separated by blanks.
   Every run of blanks between two parts is copied as a single blank;
   blanks that are followed by other white space or by the end of the
   line are skipped but not copied.

   Returns 1 and delivers a copy of the name made by abs_new_string
   in *ret_name when at least one character could be read. Returns 0
   and leaves *ret_name alone otherwise.

   The check for an empty name must compare the write pointer with the
   start of the local buffer: line_ptr points into the line buffer and
   can never be equal to buf.
*/
static int is_nonterminal_name (char **ret_name)
{ char buf[MAX_LIF_LINE_LEN+1];
  char *dptr = buf;
  while (!is_eoln () && !ch_is_white_space (*line_ptr))
    { *dptr++ = *line_ptr++;
      if (*line_ptr == ' ')
	{ may_skip_spaces ();

	  /* Check for a new idpart */
	  if (!is_eoln () && !ch_is_white_space (*line_ptr))
	    *dptr++ = ' ';
	};
    };
  *dptr = '\0';

  /* Nothing copied: there is no nonterminal name at this position */
  if (dptr == buf) return (0);
  *ret_name = abs_new_string (buf, "is_nonterminal_name");
  return (1);
}

/*
   Read a nonterminal name or report an error when there is none.
   In the latter case *ret_name is set to NULL.
*/
static void should_be_nonterminal_name (char **ret_name)
{ if (!is_nonterminal_name (ret_name))
    { *ret_name = NULL;
      lif_error ("Nonterminal name expected");
    };
}

/*
   The parts of the lif file are separated by a line that consists of
   the separator only. If the text at line_ptr is such a separator,
   the next line is read and 1 is returned. At end of file or on any
   other text, 0 is returned and nothing is consumed.
*/
#define LIF_SEPARATOR "*****"
static int is_lif_separator ()
{ if (is_eof ()) return (0);
  if (strcmp (line_ptr, LIF_SEPARATOR) != 0) return (0);
  read_line ();
  return (1);
}

/*
   Reading of numbers
*/

/*
   If line_ptr is at a decimal digit, deliver its value in *ret_val,
   skip the digit and return 1. Otherwise return 0 and leave both
   line_ptr and *ret_val alone.
*/
static int is_digit (int *ret_val)
{ char ch = *line_ptr;
  if ((ch < '0') || ('9' < ch)) return (0);
  *ret_val = (int) (ch - '0');
  line_ptr++;
  return (1);
}

/*
   Read an unsigned decimal number. Returns 1 and delivers the value in
   *ret_nr when line_ptr is at a digit; returns 0 and leaves *ret_nr
   alone otherwise. There is no check against overflow.
*/
static int is_number (int *ret_nr)
{ int total, next;

  /* A number has at least one digit */
  if (!is_digit (&total)) return (0);

  /* Accumulate the remaining digits */
  for (;;)
    { if (!is_digit (&next)) break;
      total = total * 10 + next;
    };
  *ret_nr = total;
  return (1);
}

/*
   Read a number or report an error when there is none.
   In the latter case *ret_nr is set to -1.
*/
static void should_be_number (int *ret_nr)
{ if (!is_number (ret_nr))
    { *ret_nr = -1;
      lif_error ("Number expected");
    };
}

/*
   If line_ptr is at a hexadecimal digit (0-9, A-F or a-f), deliver its
   value (0 - 15) in *ret_val, skip the digit and return 1. Otherwise
   return 0 and leave both line_ptr and *ret_val alone.

   Note: the lower case range must run from 'a' up to and including 'f';
   with an upper bound of 'a' the digits b - f would not be recognized.
*/
static int is_hex_digit (unsigned *ret_val)
{ char ch = *line_ptr;
  unsigned value;
  if (('0' <= ch) && (ch <= '9'))
    value = (unsigned) (ch - '0');
  else if (('A' <= ch) && (ch <= 'F'))
    value = (unsigned) (ch - 'A') + 10;
  else if (('a' <= ch) && (ch <= 'f'))
    value = (unsigned) (ch - 'a') + 10;
  else return (0);
  *ret_val = value;
  line_ptr++;
  return (1);
}

/*
   Read a bitset, written as a hexadecimal number. Returns 1 and delivers
   the value in *ret_val when line_ptr is at a hex digit; returns 0 and
   leaves *ret_val alone otherwise. Every following hex digit shifts the
   value 4 bits to the left.
*/
static int is_bitset (Bitset32 *ret_val)
{ unsigned bits, nibble;
  if (!is_hex_digit (&bits)) return (0);
  while (is_hex_digit (&nibble))
    { bits <<= 4;
      bits |= nibble;
    };
  *ret_val = bits;
  return (1);
}

/*
   Read a bitset or report an error when there is none.
   In the latter case *ret_val is set to 0.
*/
static void should_be_bitset (Bitset32 *ret_val)
{ if (!is_bitset (ret_val))
    { *ret_val = 0;
      lif_error ("Bitset expected");
    };
}

/*
   Read the word up to the next white space or end of line and check
   whether it is "lhs" (*ret_val becomes 1) or "rhs" (*ret_val becomes 0).
   Returns 1 for these two words and 0 for anything else; note that the
   word has been consumed in all cases.
*/
static int is_lhs_or_rhs (int *ret_val)
{ char word[MAX_LIF_LINE_LEN+1];
  char *wptr;

  /* Copy the word into a local buffer */
  for (wptr = word; !is_eoln () && !ch_is_white_space (*line_ptr); wptr++, line_ptr++)
    *wptr = *line_ptr;
  *wptr = '\0';

  if (strcmp (word, "lhs") == 0)
    { *ret_val = 1;
      return (1);
    };
  if (strcmp (word, "rhs") == 0)
    { *ret_val = 0;
      return (1);
    };
  return (0);
}

/*
   Read "lhs" or "rhs" or report an error when neither is present.
   In the latter case *ret_val is set to 0.
*/
static void should_be_lhs_or_rhs (int *ret_val)
{ if (!is_lhs_or_rhs (ret_val))
    { *ret_val = 0;
      lif_error ("lhs or rhs expected");
    };
}

/*
   The lexica part is just a list of lexicon names, one on each line.
   Every name read is handed to app_uniq_text_array to be added to
   lexicon_names. Reading stops at the separator line or at end of file.
*/
#define MAX_INIT_LEXICA 16
static void read_lexicon_names ()
{ char *lname;
  lexicon_names = init_text_array (MAX_INIT_LEXICA);
  for (;;)
    { /* Stop at the end of this part */
      if (is_eof ()) break;
      if (is_lif_separator ()) break;

      /* One name per line, possibly preceded by white space */
      may_skip_white_space ();
      should_be_name (&lname);
      if (lname != NULL)
	app_uniq_text_array (lexicon_names, lname);
      should_be_eoln ();
    };
}

/*
   The affix table consists of a number of lines, each line containing
   4 entries separated by TABs (as column separator):
   STRING 	affix name
   NUMBER 	coder index of affix
   HEXNUMBER	value of affix
   STRING	lhs/rhs indicating if the affix is a nonterminal or terminal

   Every line is registered through register_new_set_affix. Reading stops
   at the separator line or at end of file. A line for which no affix name
   could be read is not registered: should_be_name delivers NULL in that
   case (and has reported the error), which must not be passed on as a name.
*/
static void read_affix_table ()
{ int prev_lhs = 0;
  int max_index = -1;
  while (!is_eof () && !is_lif_separator ())
    { char *affix_name;
      int coder_number;
      Bitset32 affix_value;
      int affix_lhs, stat;
      may_skip_white_space ();
      should_be_name (&affix_name);
      should_be_column_separator ();
      should_be_number (&coder_number);
      should_be_column_separator ();
      should_be_bitset (&affix_value);
      should_be_column_separator ();
      should_be_lhs_or_rhs (&affix_lhs);
      should_be_eoln ();

      /* Without a name there is nothing to register */
      if (affix_name == NULL) continue;

      /* register the new set affix: either the index is returned or < 0 */
      stat = register_new_set_affix (affix_name, coder_number, affix_value, affix_lhs, prev_lhs);
      if (stat < 0)
	lif_error ("inconsistent declaration of set affix '%s'", affix_name);
      else
	{ if (affix_lhs) prev_lhs = stat;

	  /*
	     if the index is less than or equal to the previous maximum,
	     it had already been entered, typically as an affix lhs,
	     so we can free the name
	  */
	  if (max_index < stat) max_index = stat;
	  else abs_free (affix_name, "read_affix_table");
	};
    };
}

/*
   The nonterminal part consists of a number of lines, each containing
   a variable number of entries separated by TABs (as column separator):
   STRING	nonterminal name (which may contain spaces)
   NUMBER	nr of affix positions
   NUMBER	coder index (nonterminal_nr)
   STRING*	formal parameter type by name (may be INT or TEXT)
		as many as necessary

   Every line is registered through register_new_nonterminal. Reading
   stops at the separator line or at end of file.

   Robustness against malformed lines (all errors have been reported
   by the should_be_... routines when we get here):
   - should_be_number delivers -1 on error: the formals array is then
     allocated with 0 elements instead of a negative number
   - should_be_name delivers NULL on error: the formal is entered as -42,
     like an unknown affix, and the name is neither compared nor freed
   - should_be_nonterminal_name delivers NULL on error: the nonterminal
     is then not registered
   The error for an incorrect declaration reports the number of positions;
   the formals array itself can not be printed with %d.
*/
#define streq(s1,s2) (strcmp((s1),(s2)) == 0)
static void read_nonterminal_table ()
{ while (!is_eof () && !is_lif_separator ())
    { char *nonterminal_name;
      int nr_of_positions;
      int nonterminal_nr;
      int_array formals;
      char *fpar_name;
      int ix;

      may_skip_white_space ();
      should_be_nonterminal_name (&nonterminal_name);
      should_be_column_separator ();
      should_be_number (&nr_of_positions);
      should_be_column_separator ();
      formals = init_int_array ((nr_of_positions < 0) ? 0 : nr_of_positions);
      should_be_number (&nonterminal_nr);
      for (ix = 0; ix < nr_of_positions; ix++)
	{ should_be_column_separator ();
	  should_be_name (&fpar_name);
	  if (fpar_name == NULL)
	    { /* Missing formal: keep the positions aligned */
	      app_int_array (formals, -42);
	      continue;
	    };
	  if (streq (fpar_name, "INT"))
	    app_int_array (formals, FormalINT);
	  else if (streq (fpar_name, "TEXT"))
	    app_int_array (formals, FormalTEXT);
	  else
	    { int set_affix = lookup_set_affix (fpar_name);
	      if (set_affix < 0)
		{ app_int_array (formals, -42);
		  lif_error ("Unknown affix '%s'", fpar_name);
		}
	      else app_int_array (formals, set_affix);
	    };
	  abs_free (fpar_name, "read_nonterminal_table");
	};

      /* Only register when we actually have a name */
      if ((nonterminal_name != NULL) &&
	  (register_new_nonterminal (nonterminal_name, nonterminal_nr, formals) < 0))
	lif_error ("Incorrect declaration of '%s/%d'", nonterminal_name, nr_of_positions);
      should_be_eoln ();
    };
}

/*
   The pragmat part consists of a number of lines, each containing the
   name of one pragmat. Recognized are
   lexicon_has_frequencies	sets lexicon_has_frequencies to 1
   lexicon_has_bonuses		sets lexicon_has_frequencies to 0
   Any other name is ignored. Reading stops at a separator line or at
   end of file.

   should_be_name delivers NULL when no name could be read (the error
   has then been reported); such a name is neither compared nor freed.
*/
static void read_pragmat_table ()
{ while (!is_eof () && !is_lif_separator ())
  { char *pragmat;
    should_be_name (&pragmat);
    if (pragmat != NULL)
      { if (streq(pragmat, "lexicon_has_frequencies")) lexicon_has_frequencies = 1;
	else if (streq(pragmat, "lexicon_has_bonuses")) lexicon_has_frequencies = 0;
	abs_free (pragmat, "read_pragmat_table");
      };
    should_be_eoln ();
  }
}

/* exported actions */
/*
   Parse the complete lexicon interface file: open it, read its four
   parts in the order in which they occur in the file (lexicon names,
   affix table, nonterminal table, pragmats), check that nothing
   follows the last part and close the file again.
   Errors are reported while reading; close_lexicon_interface calls
   abs_abort when any error has been reported.
*/
void parse_lexicon_interface ()
{ try_open_lexicon_interface ();
  read_lexicon_names ();
  read_affix_table (); 
  read_nonterminal_table ();
  read_pragmat_table ();
  /* Every part reader stops at a separator or at end of file */
  should_be_eof ();
  close_lexicon_interface ();
}
