/*
   File: ebase_lexicon.c
   Stores a lexicon (by which we mean a set of vocabularies with their
   feature space of nonterminals, actual affix values and fact tables).
   Note that this does include the encoding of the runtime type system,
   affix domains and affix value constants.

   Copyright 2012 Marc Seutter
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: ebase_lexicon.c,v 1.11 2012/10/27 12:42:40 marcs Exp $"
*/

/* global includes */
#include <stdio.h>
#include <ctype.h>
#include <string.h>

/* libdcg includes */
#include <dcg.h>
#include <dcg_alloc.h>
#include <dcg_error.h>
#include <dcg_string.h>
#include <dcg_binfile.h>
#include <dcg_plist.h>

/* Local includes */
#include "ebase_version.h"
#include "ebase_ds.h"
#include "ebase_vocabulary.h"
#include "ebase_lexicon.h"
#include "ebase_lexicon_impl.h"
#include "ebase_input.h"

#define BLX_SUFFIX "blx"
/*
   Open the binary lexicon file of a grammar for reading.
   The file name is "<grammar_name>.blx"; it is handed to
   dcg_bin_fopen_path together with dir_name. The temporary
   file name string is released again before returning.
*/
static BinFile open_binary_lexicon_file (char *dir_name, char *grammar_name)
{ BinFile blx_file;
  char *blx_name;
  blx_name = dcg_new_fmtd_string ("%s.%s", grammar_name, BLX_SUFFIX);
  blx_file = dcg_bin_fopen_path (dir_name, blx_name, "r", 1);
  dcg_detach ((void **) &blx_name);
  return (blx_file);
}

/* Consume one string from the binary file and release it immediately */
static void ignore_string (BinFile bf)
{ string skipped;
  load_string (bf, &skipped);
  detach_string (&skipped);
}

/* Consume one string list from the binary file and release it immediately */
static void ignore_string_list (BinFile bf)
{ string_list skipped;
  load_string_list (bf, &skipped);
  detach_string_list (&skipped);
}

/*
   Load the pragmat settings, in the order in which they are stored.
   The hyphen convention, the translate option and the translate file
   name are read but not kept in the lexicon; the other values end up
   in the corresponding fields of my_lex.
*/
static void load_pragmats (BinFile bf, Lexicon my_lex)
{ int skipped_hyphen_convention;
  int skipped_translate_option;
  dcg_bin_load_int (bf, &skipped_hyphen_convention);
  dcg_bin_load_int (bf, &my_lex -> hybrid_parsing);
  dcg_bin_load_int (bf, &my_lex -> utf8_processing);
  dcg_bin_load_int (bf, &skipped_translate_option);
  dcg_bin_load_int (bf, &my_lex -> empty_white_space);

  /* The translate file name is stored here; skip over it */
  ignore_string (bf);
  load_int_list (bf, &my_lex -> white_spaces);
  load_int_list (bf, &my_lex -> separators);
}

/*
   Load the runtime vocabularies: first their number, then for every
   vocabulary its marker followed by the vocabulary itself. Each
   vocabulary is created from the vocabulary pool of the lexicon; the
   marker and vocabulary arrays are allocated here with dcg_calloc.
*/
static void load_vocabularies (BinFile bf, Lexicon my_lex)
{ int count, voc_nr;
  dcg_bin_load_int (bf, &count);
  my_lex -> nr_rt_vocs = count;
  my_lex -> rt_voc_markers = (int *) dcg_calloc (count, sizeof (int));
  my_lex -> rt_vocabularies = (Vocabulary *) dcg_calloc (count, sizeof (Vocabulary));
  voc_nr = 0;
  while (voc_nr < count)
    { /* The marker precedes the vocabulary in the file */
      dcg_bin_load_int (bf, &my_lex -> rt_voc_markers[voc_nr]);
      my_lex -> rt_vocabularies[voc_nr] = ebs_new_vocabulary (my_lex -> vocabulary_pool);
      ebs_bin_load_vocabulary (bf, my_lex -> rt_vocabularies[voc_nr]);
      voc_nr++;
    };
}

/*
   Load the fact tables from the binary lexicon file.

   The number of tables is nr_facts + hybrid_parsing; when that is zero,
   rt_fact_tables is set to NULL and nothing is read. Every table is
   stored as its hash size, its lexicon nonterminal number and a sparse
   sequence of buckets: each stored bucket is preceded by an int delta
   that is added to the current bucket index, and a delta of -1 ends
   the table. Buckets that are not stored stay int_list_nil.

   A delta that would move the bucket index outside [0, hash_size) can
   only come from a damaged file. It is reported via dcg_internal_error
   and never used as an array index.
*/
static void load_facts (BinFile bf, Lexicon my_lex)
{ int my_nr_facts = my_lex -> nr_facts + my_lex -> hybrid_parsing;
  int ix, iy;
  if (!my_nr_facts)
    { /* No facts at all */
      my_lex -> rt_fact_tables = NULL;
      return;
    };
  my_lex -> rt_fact_tables = (FactTable *) dcg_calloc (my_nr_facts, sizeof (FactTable));
  for (ix = 0; ix < my_nr_facts; ix++)
    { FactTable ft = (FactTable) dcg_malloc (sizeof (struct fact_table_rec));
      int curr = 0;
      dcg_bin_load_int (bf, &ft -> hash_size);
      dcg_bin_load_int (bf, &ft -> lex_nont_nr);
      ft -> buckets = dcg_calloc (ft -> hash_size, sizeof (int_list));
      for (iy = 0; iy < ft -> hash_size; iy++)
        ft -> buckets[iy] = int_list_nil;
      while (curr < ft -> hash_size)
        { int delta;
          dcg_bin_load_int (bf, &delta);
          if (delta == -1)	/* end of table marker */
            break;

          /*
             Here 0 <= curr < hash_size, so hash_size - curr is positive
             and the comparison cannot overflow, unlike curr + delta.
          */
          if ((delta < 0) || (delta >= ft -> hash_size - curr))
            { dcg_internal_error ("load_facts");
              break;	/* never index out of bounds, even if the call returns */
            };
          curr = curr + delta;
          load_int_list (bf, &ft -> buckets[curr]);
        };
      my_lex -> rt_fact_tables[ix] = ft;
    };
}

/*
   Read the binary lexicon "<grammar_name>.blx" and deliver the freshly
   allocated lexicon through ret_lex. The header of the file is verified
   against the EAG3 lexicon kind and version, all parts are loaded in
   the order in which they are stored, the end of file is verified and
   finally the input stage is prepared by ebs_prepare_lexicon_input.
*/
#define BLX_KIND "Lexicon"
void ebs_read_lexicon (char *dir_name, char *grammar_name, Lexicon *ret_lex)
{ Lexicon lex;
  BinFile blx;
  if (debug)
    dcg_wlog ("Starting to load the lexicon");

  /* Open the file, check its header and set up an empty lexicon */
  blx = open_binary_lexicon_file (dir_name, grammar_name);
  dcg_bin_verify_header (blx, "EAG3", BLX_KIND, EAG3_VERSION);
  lex = (Lexicon) dcg_malloc (sizeof (struct lexicon_rec));
  lex -> vocabulary_pool = dcg_pool_init ("eag3 vocabularies", 0);

  /* First part: shared with the lexicon interface file */
  ignore_string_list (blx);		/* Lexica */
  ignore_string_list (blx);		/* fact tables */
  ignore_string_list (blx);		/* triple databases */
  load_rt_domain_list (blx, &lex -> rt_domains);
  load_rt_type_list (blx, &lex -> rt_types);
  load_pragmats (blx, lex);
  dcg_bin_load_int (blx, &lex -> nr_facts);
  load_lex_nont_list (blx, &lex -> rt_lex_nonts);
  load_terminal_list (blx, &lex -> rt_terminals);
  load_cset_list (blx, &lex -> rt_character_sets);
  load_nfa_list (blx, &lex -> rt_regexp_nfas);

  /* Second part: generated by the lexicon generator */
  load_int_list (blx, &lex -> translation_sources);
  load_int_list (blx, &lex -> translation_targets);
  load_int_list (blx, &lex -> translation_penalties);
  load_affix_value_list (blx, &lex -> rt_values);
  lex -> rt_criticals = ebs_new_vocabulary (lex -> vocabulary_pool);
  ebs_bin_load_vocabulary (blx, lex -> rt_criticals);
  load_int (blx, &lex -> rt_int);
  load_int (blx, &lex -> rt_real);
  load_int (blx, &lex -> rt_text);
  load_int_list_list (blx, &lex -> rt_lex_calls);
  load_int_list_list (blx, &lex -> rt_entry_lists);

  /* load_facts needs nr_facts and hybrid_parsing, both loaded above */
  load_vocabularies (blx, lex);
  load_facts (blx, lex);

  /* Nothing may follow the fact tables: check and close the file */
  dcg_bin_verify_eof (blx);
  dcg_bin_fclose (&blx);

  /* Let the input stage preprocess the loaded lexicon */
  ebs_prepare_lexicon_input (lex);
  if (debug)
    dcg_wlog ("Finished loading and preparing the lexicon");

  /* Hand the result to the caller */
  *ret_lex = lex;
}

/*
   Deliver the entry list stored at index entry_nr in rt_entry_lists.
   An index outside the list is reported through dcg_internal_error;
   int_list_nil is the result should that call return.
*/
int_list ebs_get_entries_from_entry_nr (Lexicon lex, int entry_nr)
{ if ((entry_nr < 0) || (entry_nr >= lex -> rt_entry_lists -> size))
    { dcg_internal_error ("ebs_get_entries_from_entry_nr");
      return (int_list_nil);
    };
  return (lex -> rt_entry_lists -> array[entry_nr]);
}

/*
   Release one fact table: every bucket list is detached, then the
   bucket array and finally the table record itself. A NULL ft is
   reported through dcg_internal_error.
*/
static void detach_fact_table (FactTable *ft)
{ FactTable table;
  int bucket_nr;
  if (ft == NULL)
    dcg_internal_error ("detach_fact_table");
  table = *ft;
  bucket_nr = 0;
  while (bucket_nr < table -> hash_size)
    { detach_int_list (&table -> buckets[bucket_nr]);
      bucket_nr++;
    };
  dcg_detach ((void **) &table -> buckets);
  dcg_detach ((void **) ft);
}

/*
   Release a lexicon together with everything that was allocated for it
   while it was read by ebs_read_lexicon.

   A NULL lex is reported through dcg_internal_error. When *lex is
   already NULL there is nothing to release and the call returns.

   The vocabularies (including rt_criticals) were created from
   vocabulary_pool and go away with the pool. The rt_voc_markers and
   rt_vocabularies arrays, however, were allocated with dcg_calloc and
   the white_spaces and separators lists were loaded with load_int_list,
   so they must be released separately.

   NOTE(review): anything that ebs_prepare_lexicon_input attaches to the
   lexicon is not visible from this file and is not released here;
   confirm against ebase_input.c.
*/
void ebs_detach_lexicon (Lexicon *lex)
{ int my_nr_facts, ix;
  Lexicon old_lex;
  if (lex == NULL)
    dcg_internal_error ("ebs_detach_lexicon");

  /* Nothing to do for a lexicon that is already detached */
  old_lex = *lex;
  if (old_lex == NULL)
    return;

  /* releases all vocabularies */
  dcg_pool_release (old_lex -> vocabulary_pool);

  /* The arrays holding the markers and vocabulary handles are not pooled */
  dcg_detach ((void **) &old_lex -> rt_voc_markers);
  dcg_detach ((void **) &old_lex -> rt_vocabularies);

  /* Recursively free the subparts */
  detach_rt_domain_list (&old_lex -> rt_domains);
  detach_rt_type_list (&old_lex -> rt_types);
  detach_int_list (&old_lex -> white_spaces);
  detach_int_list (&old_lex -> separators);
  detach_lex_nont_list (&old_lex -> rt_lex_nonts);
  detach_terminal_list (&old_lex -> rt_terminals);
  detach_cset_list (&old_lex -> rt_character_sets);
  detach_nfa_list (&old_lex -> rt_regexp_nfas);
  detach_int_list (&old_lex -> translation_sources);
  detach_int_list (&old_lex -> translation_targets);
  detach_int_list (&old_lex -> translation_penalties);
  detach_affix_value_list (&old_lex -> rt_values);
  detach_int_list_list (&old_lex -> rt_lex_calls);
  detach_int_list_list (&old_lex -> rt_entry_lists);

  /* Same table count as used when the fact tables were loaded */
  my_nr_facts = old_lex -> nr_facts + old_lex -> hybrid_parsing;
  for (ix = 0; ix < my_nr_facts; ix++)
    detach_fact_table (&old_lex -> rt_fact_tables[ix]);
  dcg_detach ((void **) &old_lex -> rt_fact_tables);
  dcg_detach ((void **) lex);
}
