/*
   File: entries.c
   Collects all lexicon and fact entries

   Copyright (C) 2012 Marc Seutter

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: entries.c,v 1.6 2012/05/30 14:22:53 marcs Exp $"
*/

/* standard includes */
#include <stdio.h>

/* libdcg includes */
#include <dcg.h>
#include <dcg_error.h>

/* libeagbase includes */
#include <ebase_version.h>
#include <ebase_ds.h>
#include <ebase_hash.h>

/* Local includes */
#include "options.h"
#include "globals.h"
#include "entries.h"

/*
   Build the mapping from fact numbers to lexicon nonterminal numbers.

   nr_facts + hybrid_parsing entries, all -1, are appended to
   fact_nr_to_lex_nont_nr (presumably empty on entry; verify against
   the initialisation code). Every lexicon nonterminal in all_lex_nonts
   whose fact_nr is not -1 then stores its own index in the slot of
   its fact number.

   Aborts when a fact number lies outside [0, nr_facts) or when one of
   the first nr_facts slots is still -1 afterwards. Slots beyond
   nr_facts (only present when hybrid_parsing is non zero) are not
   checked and may remain -1.
*/
void prepare_facts ()
{ int my_nr_facts = nr_facts + hybrid_parsing;
  int ix;

  /* Start with every fact unmapped */
  for (ix = 0; ix < my_nr_facts; ix++)
    app_int_list (fact_nr_to_lex_nont_nr, -1);

  /* Let every lexicon nonterminal that carries a fact nr claim its slot */
  for (ix = 0; ix < all_lex_nonts -> size; ix++)
    { lex_nont nt = all_lex_nonts -> array[ix];
      int fact_nr = nt -> fact_nr;
      if (fact_nr == -1) continue;
      if ((fact_nr < 0) || (fact_nr >= nr_facts))
        dcg_abort ("prepare_facts", "fact nr %d out of range", fact_nr);
      fact_nr_to_lex_nont_nr -> array[fact_nr] = ix;
    };

  /* All of the first nr_facts slots must have been claimed */
  for (ix = 0; ix < nr_facts; ix++)
    if (fact_nr_to_lex_nont_nr -> array[ix] == -1)
      dcg_abort ("prepare_facts", "Fact nr %d not mapped", ix);
}

/*
   Enter a lexeme into the vocabulary that belongs to lexeme_marker.
   The vocabulary of a marker is created the first time the marker
   is used. A marker outside [0, MAX_MARKERS) is reported as an
   internal error. The result is whatever ebs_enter_into_vocabulary
   returns for this lexeme.
*/
int *enter_into_lexicon (char *lexeme, int lexeme_marker)
{ int valid_marker = (0 <= lexeme_marker) && (lexeme_marker < MAX_MARKERS);
  if (!valid_marker)
    dcg_internal_error ("enter_into_lexicon");

  /* First use of this marker: its vocabulary does not exist yet */
  if (all_vocabularies[lexeme_marker] == vocabulary_nil)
    all_vocabularies[lexeme_marker] = ebs_new_vocabulary (my_default_pool);

  return (ebs_enter_into_vocabulary (all_vocabularies[lexeme_marker], lexeme));
}

/*
   An entry is the combination of a call_id and a frequency.
   Maybe we will later look into adding multi token bonuses as
   is done in AGFL: for the moment we ignore these.

   Entry lists are therefore organised in pairs of ints,
   the first being the saved call_id and the second being
   the frequency of this call in this entry list.

   Each lexeme or fact with crits will map to a unique entry list.
   When an entry is registered on an entry list that already
   holds an entry with the same call_id, the frequencies are
   added instead.
*/
/*
   Return the entry list that *info_ptr refers to, creating it when
   *info_ptr is still 0. A freshly created list is appended to
   all_entry_lists and its index is stored back into *info_ptr.

   NOTE(review): 0 doubles as the "no list yet" marker, so this
   presumably relies on index 0 of all_entry_lists never being handed
   out to a real entry list; confirm against its initialisation.
*/
static int_list create_or_locate_entry_list (int *info_ptr)
{ int_list fresh;

  /* No list recorded yet: make one and remember where it lives */
  if (*info_ptr == 0)
    { fresh = new_int_list ();
      *info_ptr = all_entry_lists -> size;
      app_int_list_list (all_entry_lists, fresh);
      return (fresh);
    }

  /* An earlier registration already created the list */
  return (all_entry_lists -> array[*info_ptr]);
}

void register_new_entry (int *info_ptr, int call_id, int frequency)
{ int_list my_list = create_or_locate_entry_list (info_ptr);
  int ix;
  if (my_list -> size % 2)
    dcg_internal_error ("register_new_entry");
  for (ix = 0; ix < my_list -> size; ix += 2)
    if (my_list -> array[ix] == call_id)
      { /* Found earlier entry: add frequency */
	my_list -> array[ix + 1] += frequency;
	return;
      };

  /* This is a fresh entry, record it */
  app_int_list (my_list, call_id);
  app_int_list (my_list, frequency);
  nr_entries++;
}

/*
   A fact is stored as a vector of critical params + one entry reference.
   From the search key (i.e. the list of critical parameters), we calculate
   a hash key (currently all facts have the same hash size). The bucket
   then holds all facts with this hash key.
*/
/*
   Compare the crits -> size ints of the bucket that start at index
   with the ints in crits. Returns 1 when they are all equal and 0
   at the first difference.
*/
static int equal_fact (int_list bucket, int index, int_list crits)
{ int pos = 0;
  while (pos < crits -> size)
    { if (crits -> array[pos] != bucket -> array[index + pos])
        return (0);
      pos++;
    }
  return (1);
}

/*
   Locate or create the fact with critical parameters crits in the
   compiled fact table of fact_nr. Each fact takes crits -> size + 1
   ints in its hash bucket: the critical parameters followed by one
   int that is initialised to 0 for a new fact.

   Returns a pointer to that last int of the (existing or new) fact.

   NOTE(review): the result points into the bucket's array and is
   presumably invalidated when the bucket grows later on; confirm
   against the reallocation behaviour of app_int_list.
*/
int *enter_into_fact_table (int fact_nr, int_list crits)
{ int stride = crits -> size + 1;
  int hash = ebs_hash_search_key (crits, fact_hash_size);
  int_list bucket = all_compiled_fact_tables[fact_nr] -> array[hash];
  int pos;

  /* First fact with this hash: create its bucket and hook it in */
  if (bucket == int_list_nil)
    { bucket = init_int_list (3 * stride);
      all_compiled_fact_tables[fact_nr] -> array[hash] = bucket;
    }

  /* A bucket must consist of whole facts only */
  if ((bucket -> size % stride) != 0)
    dcg_internal_error ("enter_into_fact_table");

  /* Walk the bucket fact by fact, looking for the same crits */
  pos = 0;
  while (pos < bucket -> size)
    { if (equal_fact (bucket, pos, crits))
        return (&bucket -> array[pos + crits -> size]);
      pos += stride;
    }

  /* Not present: append the crits followed by a zeroed last int */
  for (pos = 0; pos < crits -> size; pos++)
    app_int_list (bucket, crits -> array[pos]);
  app_int_list (bucket, 0);
  return (&bucket -> array[bucket -> size - 1]);
}

/*
   A terminal is received as a list of ints, holding either UTF-32
   code points (utf8_processing set) or single byte values
   (utf8_processing not set). Convert it into a proper lexeme for
   the vocabularies: a NUL terminated string, UTF-8 encoded in the
   first case and one byte per entry in the second.

   The lexeme is allocated with dcg_malloc and handed to the caller.
   Aborts on an entry >= 256 in single byte mode and on an entry
   >= 0x200000 in UTF-8 mode.

   NOTE(review): negative entries are not rejected. In single byte
   mode they are masked to their low 8 bits, in UTF-8 mode they take
   the 'entry < 128' branch; confirm whether callers can produce them.
   NOTE(review): entries in 0x110000..0x1FFFFF and in the surrogate
   range 0xD800..0xDFFF are encoded as given although they are not
   valid Unicode scalar values; confirm whether they should abort.
*/
char *convert_terminal_to_lexeme (int_list text)
{ /* Worst case: four bytes per entry plus the terminating NUL */
  char *lexeme = dcg_malloc (4 * text -> size + 1);
  char *dptr = lexeme;
  int ix;
  for (ix = 0; ix < text -> size; ix++)
    { int entry = text -> array[ix];
      if (!utf8_processing && (entry >= 256))
        dcg_abort ("convert_terminal_to_lexeme", "Illegal terminal entry %d", entry);
      else if (!utf8_processing)
        *dptr++ = (char) (entry & 0xff);
      else if (entry < 128)
        { /* First 128 characters of UTF-8 */
          *dptr++ = (char) entry;
        }
      else if (entry < 0x0800)
        { /* one continuation byte: 110xxxxx 10xxxxxx */
          *dptr++ = (char)(0xC0 | ((entry >> 6) & 0x1F));
          *dptr++ = (char)(0x80 | (entry & 0x3F));
        }
      else if (entry < 0x10000)
        { /* two continuation bytes: 1110xxxx 10xxxxxx 10xxxxxx */
          *dptr++ = (char)(0xE0 | ((entry >> 12) & 0x0F));
          *dptr++ = (char)(0x80 | ((entry >> 6) & 0x3F));
          *dptr++ = (char)(0x80 | (entry & 0x3F));
        }
      else if (entry < 0x200000)
        { /* three continuation bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
             The three continuation bytes carry bits 0..17, so the lead
             byte must take bits 18..20 (a shift by 16 would repeat bits
             16 and 17 and encode a different code point). */
          *dptr++ = (char)(0xF0 | ((entry >> 18) & 0x07));
          *dptr++ = (char)(0x80 | ((entry >> 12) & 0x3F));
          *dptr++ = (char)(0x80 | ((entry >> 6) & 0x3F));
          *dptr++ = (char)(0x80 | (entry & 0x3F));
        }
      else dcg_abort ("convert_terminal_to_lexeme", "too large utf encoding %d", entry);
    };
  *dptr = '\0';
  return (lexeme);
}
