/*
   File: blx_writer.c
   Generates the binary lexicon file
   Verifies an old binary lexicon file against a new binary lexicon interface file

   Copyright (C) 2012 Marc Seutter

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: blx_writer.c,v 1.9 2012/08/01 12:40:41 marcs Exp $"
*/

/* standard includes */
#include <stdio.h>

/* libdcg includes */
#include <dcg.h>
#include <dcg_error.h>
#include <dcg_fileutil.h>
#include <dcg_binfile.h>

/* libeagbase includes */
#include <ebase_version.h>
#include <ebase_ds.h>

/* Local includes */
#include "options.h"
#include "globals.h"
#include "affix_values.h"
#include "blx_writer.h"

/*
   The binary lexicon file is a binary encoded file consisting of the following parts
   Note that the first 11 items are identical to the ones in the lexicon interface file.
   This is on purpose: it allows us to decide whether the binary lexicon needs to be
   regenerated (i.e. if these items have not changed and no dat/fct/trp files are newer
   than during the previous compilation, we do not have to recompile the binary lexicon).

   Although the character set table and regexp table are simply passed on, they are
   incorporated into the binary lexicon file to enable the user to test his lexica
   with independent tools before trying to use them in a full blown grammar.

   1) A list of lexicon names
   2) A list of fact table names
   3) A list of triples file names
   4) All affix super domains
   5) The type table
   6) Those pragmat settings that influence the lexicalization process
   7) The number of facts
   8) The lexicon/fact nonterminal table (defining all nonterminals with their typing)
   9) The terminal table (defining all grammar terminals)
   10) The character set table (defining all character sets)
   11) The regexp nfa table (defining all nfas for the regexps).

   Parts, generated by lexgen:

   12) The collected set of translations
   13) The collected set of (actual) affix values
   14) The critical text vocabulary
   15) The set of all calls
   16) The set of all entry lists
   17) All vocabularies (list of (marker, vocabulary) pairs)
   18) All facts
*/
#define BLX_KIND "Lexicon"
/*
   Open the binary lexicon file, for writing when for_write is set
   and for reading otherwise. The file to open is selected as follows:
   - lexicon_fname starting with '/': that path is opened as is;
   - any other lexicon_fname: it is opened relative to dir_name;
   - no lexicon_fname: "<base_gname>.<BLX_SUFFIX>" is opened relative
     to dir_name.
   The for_write flag is also handed to the dcg open routines as their
   last argument.
*/
static BinFile open_binary_lexicon_file (int for_write)
{ char *open_mode = "r";
  char *default_name;
  BinFile result;
  if (for_write) open_mode = "w";

  /* Commandline overruling with an absolute path */
  if ((lexicon_fname != string_nil) && (lexicon_fname[0] == '/'))
    return (dcg_bin_fopen (lexicon_fname, open_mode, for_write));

  /* Commandline overruling with a name relative to dir_name */
  if (lexicon_fname != string_nil)
    return (dcg_bin_fopen_path (dir_name, lexicon_fname, open_mode, for_write));

  /* No commandline overruling: derive the name from the grammar name */
  default_name = dcg_new_fmtd_string ("%s.%s", base_gname, BLX_SUFFIX);
  result = dcg_bin_fopen_path (dir_name, default_name, open_mode, for_write);
  dcg_detach ((void **) &default_name);
  return (result);
}

/*
   Save the pragmat settings that are part of the binary lexicon file:
   five integer settings, the translate file name and the lists of
   white space and separator values.
   The order of the saves defines the file layout; pragmats_compatible
   reads the same items back in exactly this order.
*/
static void save_pragmats (BinFile bf)
{ save_int (bf, hyphen_convention_active);
  save_int (bf, hybrid_parsing);
  save_int (bf, utf8_processing);
  save_int (bf, translate_option);
  save_int (bf, empty_white_space_option);
  save_string (bf, translate_fname);
  save_int_list (bf, white_spaces);
  save_int_list (bf, separators);
}

/*
   Save all vocabularies that are present in all_vocabularies.
   Written are: the number of non nil vocabularies, followed by
   one pair (marker, vocabulary) for each of them, in increasing
   marker order. The marker is the index into all_vocabularies.
*/
static void save_vocabularies (BinFile bf)
{ int marker;
  int count = 0;

  /* First pass: the reader needs the number of pairs up front */
  for (marker = 0; marker < MAX_MARKERS; marker++)
    if (all_vocabularies[marker] != vocabulary_nil)
      count++;
  save_int (bf, count);

  /* Second pass: write the pairs themselves, skipping empty slots */
  for (marker = 0; marker < MAX_MARKERS; marker++)
    { if (all_vocabularies[marker] == vocabulary_nil) continue;
      save_int (bf, marker);
      ebs_bin_save_vocabulary (bf, all_vocabularies[marker]);
    }
}

/*
   Save all compiled fact tables (nr_facts + hybrid_parsing of them).
   For every table the following is written:
   - the size of the table (in case we implement fact resizing
     in the future);
   - the entry of fact_nr_to_lex_nont_nr belonging to this table;
   - for every non empty bucket a pair (distance, bucket contents),
     where the distance is the bucket index minus the index of the
     previously saved bucket (minus 0 for the first saved bucket);
   - the value -1 to mark the end of the table.
*/
static void save_facts (BinFile bf)
{ int total_tables = nr_facts + hybrid_parsing;
  int table_ix;
  for (table_ix = 0; table_ix < total_tables; table_ix++)
    { int_list_list table = all_compiled_fact_tables[table_ix];
      int last_saved = 0;
      int bucket_ix;
      save_int (bf, table -> size);
      save_int (bf, fact_nr_to_lex_nont_nr -> array[table_ix]);
      for (bucket_ix = 0; bucket_ix < table -> size; bucket_ix++)
        { /* Empty buckets are not written at all */
          if (table -> array[bucket_ix] == int_list_nil) continue;
          save_int (bf, bucket_ix - last_saved);
          save_int_list (bf, table -> array[bucket_ix]);
          last_saved = bucket_ix;
        }
      save_int (bf, -1);	/* End of table marker */
    }
}

/*
   Write the complete binary lexicon file: header, the parts shared
   with the lexicon interface file, the parts generated by the lexicon
   generator and the end of file marker.
   The order of the saves defines the file layout and must not change.
   NOTE(review): the result of opening the file is not checked here;
   presumably the dcg open routines do not return upon failure when
   opening for writing - to be confirmed.
*/
void write_binary_lexicon ()
{ BinFile blx = open_binary_lexicon_file (1);
  dcg_bin_save_header (blx, "EAG3", BLX_KIND, EAG3_VERSION);

  /* Shared with the lexicon interface file: names of the input files */
  save_string_list (blx, all_lexica);
  save_string_list (blx, all_fact_tables);
  save_string_list (blx, all_triple_databases);

  /* Shared with the lexicon interface file: domains, types, pragmats */
  save_rt_domain_list (blx, all_rt_domains);
  save_rt_type_list (blx, all_rt_types);
  save_pragmats (blx);

  /* Shared with the lexicon interface file: facts, nonterminals and tables */
  save_int (blx, nr_facts);
  save_lex_nont_list (blx, all_lex_nonts);
  save_terminal_list (blx, all_terminals);
  save_cset_list (blx, all_csets);
  save_nfa_list (blx, all_regexp_nfas);

  /* Generated by the lexicon generator: translations */
  save_int_list (blx, translation_sources);
  save_int_list (blx, translation_targets);
  save_int_list (blx, translation_penalties);

  /* Generated by the lexicon generator: affix values and critical texts */
  save_affix_value_list (blx, all_actuals);
  ebs_bin_save_vocabulary (blx, critical_text_vocabulary);

  /* The results of int_formal, real_formal and text_formal, in this order */
  save_int (blx, int_formal ());
  save_int (blx, real_formal ());
  save_int (blx, text_formal ());

  /* Generated by the lexicon generator: calls, entries, vocabularies, facts */
  save_int_list_list (blx, all_calls);
  save_int_list_list (blx, all_entry_lists);
  save_vocabularies (blx);
  save_facts (blx);

  dcg_bin_save_eof (blx);
  dcg_bin_fclose (&blx);
}

/*
   Check if a new lexicon interface is compatible with the old binary lexicon.
*/
/*
   Load the next string from the old binary lexicon and return
   the result of streq on that string and str.
   The loaded string is detached before returning.
*/
static int string_compatible (BinFile bf, char *str)
{ string stored;
  int same;
  load_string (bf, &stored);
  same = streq (stored, str);
  detach_string (&stored);
  return (same);
}

/*
   Load the next int from the old binary lexicon and return 1
   if it equals val, 0 otherwise.
*/
static int int_compatible (BinFile bf, int val)
{ int stored;
  load_int (bf, &stored);
  if (stored == val) return (1);
  return (0);
}

/*
   Load the next string list from the old binary lexicon and return 1
   if cmp_string_list reports no difference with sl, 0 otherwise.
   The loaded list is detached before returning.
*/
static int string_list_compatible (BinFile bf, string_list sl)
{ string_list stored;
  int equal;
  load_string_list (bf, &stored);
  equal = (cmp_string_list (stored, sl) == 0);
  detach_string_list (&stored);
  return (equal);
}

/*
   Load the next int list from the old binary lexicon and return 1
   if cmp_int_list reports no difference with sl, 0 otherwise.
   The loaded list is detached before returning.
*/
static int int_list_compatible (BinFile bf, int_list sl)
{ int_list stored;
  int equal;
  load_int_list (bf, &stored);
  equal = (cmp_int_list (stored, sl) == 0);
  detach_int_list (&stored);
  return (equal);
}

/*
   Load the next rt domain list from the old binary lexicon and return 1
   if cmp_rt_domain_list reports no difference with rts, 0 otherwise.
   The loaded list is detached before returning.
*/
static int rt_domain_list_compatible (BinFile bf, rt_domain_list rts)
{ rt_domain_list stored;
  int equal;
  load_rt_domain_list (bf, &stored);
  equal = (cmp_rt_domain_list (stored, rts) == 0);
  detach_rt_domain_list (&stored);
  return (equal);
}

/*
   Load the next rt type list from the old binary lexicon and return 1
   if cmp_rt_type_list reports no difference with rts, 0 otherwise.
   The loaded list is detached before returning.
*/
static int rt_type_list_compatible (BinFile bf, rt_type_list rts)
{ rt_type_list stored;
  int equal;
  load_rt_type_list (bf, &stored);
  equal = (cmp_rt_type_list (stored, rts) == 0);
  detach_rt_type_list (&stored);
  return (equal);
}

/*
   Compare the pragmat settings stored in the old binary lexicon with
   the current ones; return 1 if all of them match, 0 otherwise.
   The items are read in the order in which save_pragmats writes them.
   The && chain evaluates from left to right and stops at the first
   mismatch, so nothing is read from the file beyond that point.
*/
static int pragmats_compatible (BinFile bf)
{ return (int_compatible (bf, hyphen_convention_active) &&
          int_compatible (bf, hybrid_parsing) &&
          int_compatible (bf, utf8_processing) &&
          int_compatible (bf, translate_option) &&
          int_compatible (bf, empty_white_space_option) &&
          string_compatible (bf, translate_fname) &&
          int_list_compatible (bf, white_spaces) &&
          int_list_compatible (bf, separators));
}

/*
   Load the next lex nont list from the old binary lexicon and return 1
   if cmp_lex_nont_list reports no difference with rts, 0 otherwise.
   The loaded list is detached before returning.
*/
static int lex_nont_list_compatible (BinFile bf, lex_nont_list rts)
{ lex_nont_list stored;
  int equal;
  load_lex_nont_list (bf, &stored);
  equal = (cmp_lex_nont_list (stored, rts) == 0);
  detach_lex_nont_list (&stored);
  return (equal);
}

/*
   Load the next terminal list from the old binary lexicon and return 1
   if cmp_terminal_list reports no difference with rts, 0 otherwise.
   The loaded list is detached before returning.
*/
static int terminal_list_compatible (BinFile bf, terminal_list rts)
{ terminal_list stored;
  int equal;
  load_terminal_list (bf, &stored);
  equal = (cmp_terminal_list (stored, rts) == 0);
  detach_terminal_list (&stored);
  return (equal);
}

/*
   Load the next cset list from the old binary lexicon and return 1
   if cmp_cset_list reports no difference with rts, 0 otherwise.
   The loaded list is detached before returning.
*/
static int cset_list_compatible (BinFile bf, cset_list rts)
{ cset_list stored;
  int equal;
  load_cset_list (bf, &stored);
  equal = (cmp_cset_list (stored, rts) == 0);
  detach_cset_list (&stored);
  return (equal);
}

/*
   Load the next nfa list from the old binary lexicon and return 1
   if cmp_nfa_list reports no difference with rts, 0 otherwise.
   The loaded list is detached before returning.
*/
static int nfa_list_compatible (BinFile bf, nfa_list rts)
{ nfa_list stored;
  int equal;
  load_nfa_list (bf, &stored);
  equal = (cmp_nfa_list (stored, rts) == 0);
  detach_nfa_list (&stored);
  return (equal);
}

/*
   Verify the header of the old binary lexicon and compare its first
   11 parts with the current lexicon interface; return 1 if everything
   matches, 0 otherwise.
   The parts are read in the order in which write_binary_lexicon saves
   them. Checking stops at the first mismatch, so the rest of the file
   is not read in that case.
*/
static int binary_lexicon_contents_compatible (BinFile bf)
{ int stored_nr_facts;

  /* Header, file names, domains, types and pragmats */
  if (!(dcg_bin_verify_version (bf, "EAG3", BLX_KIND, EAG3_VERSION) &&
        string_list_compatible (bf, all_lexica) &&
        string_list_compatible (bf, all_fact_tables) &&
        string_list_compatible (bf, all_triple_databases) &&
        rt_domain_list_compatible (bf, all_rt_domains) &&
        rt_type_list_compatible (bf, all_rt_types) &&
        pragmats_compatible (bf)))
    return (0);

  /* The number of facts */
  load_int (bf, &stored_nr_facts);
  if (stored_nr_facts != nr_facts) return (0);

  /* Nonterminals, terminals, character sets and regexp nfas */
  return (lex_nont_list_compatible (bf, all_lex_nonts) &&
          terminal_list_compatible (bf, all_terminals) &&
          cset_list_compatible (bf, all_csets) &&
          nfa_list_compatible (bf, all_regexp_nfas));
}

/*
   Open the old binary lexicon for reading and check whether its
   contents are compatible with the current lexicon interface.
   Returns 0 if the file could not be opened (NULL handle), otherwise
   the result of binary_lexicon_contents_compatible.
   The file is closed again before returning.
*/
int new_lexicon_interface_compatible ()
{ BinFile old_blx = open_binary_lexicon_file (0);
  int compatible = 0;
  if (old_blx != NULL)
    { compatible = binary_lexicon_contents_compatible (old_blx);
      dcg_bin_fclose (&old_blx);
    }
  return (compatible);
}

/*
   Return the time that dcg_file_mtime reports for the binary lexicon
   file (presumably its last modification time - to be confirmed).
   The path is selected in the same way as for opening the file:
   - lexicon_fname starting with '/': that path is used as is;
   - any other lexicon_fname: it is taken relative to dir_name;
   - no lexicon_fname: "<base_gname>.<BLX_SUFFIX>" relative to dir_name.
   If dcg_file_mtime fails, dcg_panic is called with the path.
*/
time_t get_binary_lexicon_age ()
{ time_t mtime;
  char *full_path;
  if (lexicon_fname != string_nil)
    { /* Commandline overruling: absolute or relative to dir_name */
      if (lexicon_fname[0] == '/') full_path = attach_string (lexicon_fname);
      else full_path = dcg_construct_path (dir_name, lexicon_fname);
    }
  else
    { /* No commandline overruling: derive the name from the grammar name */
      char *default_name = dcg_new_fmtd_string ("%s.%s", base_gname, BLX_SUFFIX);
      full_path = dcg_construct_path (dir_name, default_name);
      detach_string (&default_name);
    }
  if (!dcg_file_mtime (full_path, &mtime))
    dcg_panic ("Could not get age of file '%s'", full_path);
  detach_string (&full_path);
  return (mtime);
}
