/*
   File: ebase_input.c
   Defines the input routines that recognize elementary input (white
   space, separators and character translations) according to a
   previously read lexicon. Each routine has a table driven path for
   single byte input and a separate path used when utf8_processing
   is enabled.

   Copyright 2012 Marc Seutter
 
   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: ebase_input.c,v 1.8 2013/03/13 10:06:43 marcs Exp $"
*/

/* global includes */
#include <stdio.h>
#include <ctype.h>
#include <string.h>

/* libdcg includes */
#include <dcg.h>
#include <dcg_alloc.h>
#include <dcg_error.h>
#include <dcg_string.h>
#include <dcg_plist.h>

/* Local includes */
#include "ebase_version.h"
#include "ebase_ds.h"
#include "ebase_utils.h"
#include "ebase_vocabulary.h"
#include "ebase_lexicon.h"
#include "ebase_lexicon_impl.h"
#include "ebase_input.h"

/*
   Prepare the lexicon for use by the recognizers in this file.

   The translation sources, targets and penalties are parallel lists;
   a length mismatch is reported through dcg_internal_error.

   With utf8_processing set, the four byte indexed tables are set to
   null and utf8_translation_targets is filled from the translation
   targets by ebs_convert_utf8_chars.

   Otherwise four 256 entry tables, indexed by byte value, are allocated
   and filled from the white space, separator and translation lists,
   and utf8_translation_targets is set to string_list_nil. A list entry
   outside 1..255 is reported through dcg_internal_error.
*/
void ebs_prepare_lexicon_input (Lexicon lex)
{ int idx;

  /* The three translation lists must agree in length */
  if (!((lex -> translation_sources -> size == lex -> translation_targets -> size) &&
        (lex -> translation_targets -> size == lex -> translation_penalties -> size)))
    dcg_internal_error ("ebs_prepare_lexicon_input");

  if (lex -> utf8_processing)
    { /* UTF-8 mode: no byte tables, the recognizers scan the lists */
      lex -> translation_map_penalty = (Penalty *) NULL;
      lex -> translation_map = (char *) NULL;
      lex -> separator_chars = (char *) NULL;
      lex -> white_space_chars = (char *) NULL;
      lex -> utf8_translation_targets = ebs_convert_utf8_chars (lex -> translation_targets);
      return;
    }

  /*
     Single byte mode: one table slot per possible byte value.
     The tables are expected to come back zeroed from dcg_calloc,
     so only the listed characters need to be marked.
  */
  lex -> white_space_chars = (char *) dcg_calloc (256, sizeof (char));
  lex -> separator_chars = (char *) dcg_calloc (256, sizeof (char));
  lex -> translation_map = (char *) dcg_calloc (256, sizeof (char));
  lex -> translation_map_penalty = (Penalty *) dcg_calloc (256, sizeof (Penalty));
  lex -> utf8_translation_targets = string_list_nil;

  /* Mark every white space character */
  idx = 0;
  while (idx < lex -> white_spaces -> size)
    { int code = lex -> white_spaces -> array[idx];
      if (!((0 < code) && (code < 256)))
        dcg_internal_error ("ebs_prepare_lexicon_input");
      lex -> white_space_chars[code] = 1;
      idx++;
    }

  /* Mark every separator character */
  idx = 0;
  while (idx < lex -> separators -> size)
    { int code = lex -> separators -> array[idx];
      if (!((0 < code) && (code < 256)))
        dcg_internal_error ("ebs_prepare_lexicon_input");
      lex -> separator_chars[code] = 1;
      idx++;
    }

  /* Record target and penalty for every translated character */
  idx = 0;
  while (idx < lex -> translation_sources -> size)
    { int code = lex -> translation_sources -> array[idx];
      int target = lex -> translation_targets -> array[idx];
      Penalty cost = (Penalty) lex -> translation_penalties -> array[idx];
      unsigned char target_byte = (unsigned char)(unsigned int) target;
      if (!((0 < code) && (code < 256)))
        dcg_internal_error ("ebs_prepare_lexicon_input");
      lex -> translation_map[code] = (char) target_byte;
      lex -> translation_map_penalty[code] = cost;
      idx++;
    }
}

/*
   Try to decode one UTF-8 encoded character starting at *input.

   On success the decoded code point is stored in *ret_val, *input is
   advanced past the bytes that were consumed (1 to 4) and 1 is returned.
   On failure 0 is returned and neither *input nor *ret_val is modified.

   Rejected are:
   - a lead byte that is a continuation byte (10xxxxxx) or 11111xxx,
   - a missing or malformed continuation byte (this includes running
     into the terminating null byte in the middle of a sequence),
   - overlong encodings, i.e. a value encoded in more bytes than needed,
   - the UTF-16 surrogate range U+D800..U+DFFF,
   - values above U+10FFFF.

   A null byte on its own decodes as code point 0 and is consumed.
*/
int ebs_is_utf8_char (char **input, int *ret_val)
{ char *ptr = *input;
  int val = (int)(unsigned int)(unsigned char) *ptr++;
  int nr_cbytes = 0;
  int min_val = 0;	/* smallest value that needs this sequence length */
  if ((val & 0x80) == 0) val &= 0x7F;
  else if ((val & 0xE0) == 0xC0)
    { nr_cbytes = 1;
      min_val = 0x80;
      val &= 0x1F;
    }
  else if ((val & 0xF0) == 0xE0)
    { nr_cbytes = 2;
      min_val = 0x800;
      val &= 0xF;
    }
  else if ((val & 0xF8) == 0xF0)
    { nr_cbytes = 3;
      min_val = 0x10000;
      val &= 0x7;
    }
  else return (0);

  /* Every continuation byte contributes its low 6 bits */
  while (nr_cbytes)
    { int tval = (int)(unsigned int)(unsigned char) *ptr++;
      if ((tval & 0xC0) != 0x80) return (0);
      val = (val << 6) | (tval & 0x3F);
      nr_cbytes--;
    };

  /* Refuse overlong forms, surrogates and out of range values */
  if (val < min_val) return (0);
  if ((0xD800 <= val) && (val <= 0xDFFF)) return (0);
  if (val > 0x10FFFF) return (0);

  *input = ptr;
  *ret_val = val;
  return (1);
}

/*
   Return the position just past the character at input.
   Without utf8_processing that is always one byte further on.
   With utf8_processing the length is determined by decoding the
   character with ebs_is_utf8_char; when that fails string_nil
   is returned.
*/
char *ebs_skip_one_char (Lexicon lex, char *input)
{ int ignored_value;
  if (lex -> utf8_processing)
    { /* input is our own copy, so the decoder may advance it */
      if (!ebs_is_utf8_char (&input, &ignored_value))
        return (string_nil);
      return (input);
    }
  return (input + 1);
}

/*
   Check whether the character at *input is one of the separators
   of the lexicon. If so, advance *input past it and return 1;
   otherwise return 0 and leave *input as it was.
   A lexicon without separators never matches.
*/
int ebs_is_separator (Lexicon lex, char **input)
{ char *cursor = *input;
  int code, idx;
  if (lex -> separators -> size == 0)
    return (0);

  if (lex -> utf8_processing)
    { /* Decode one character and look it up in the separator list */
      if (!ebs_is_utf8_char (&cursor, &code))
        return (0);
      idx = 0;
      while (idx < lex -> separators -> size)
        { if (lex -> separators -> array[idx] == code)
            { *input = cursor;
              return (1);
            }
          idx++;
        }
      return (0);
    }

  /* Single byte mode: consult the table indexed by byte value */
  if (lex -> separator_chars[(unsigned char) *cursor] == 0)
    return (0);
  *input = cursor + 1;
  return (1);
}

/*
   Non consuming variant of ebs_is_separator: report whether a
   separator starts at input. Only the local copy of the pointer
   is advanced, so the caller's position is unaffected.
*/
int ebs_ahead_separator (Lexicon lex, char *input)
{ return (ebs_is_separator (lex, &input));
}

/*
   Report whether the character immediately before input is a
   separator. A lexicon without separators never matches.

   The bytes before input are read unconditionally, so input must
   not be the start of the buffer. In UTF-8 mode the code steps
   back over continuation bytes (10xxxxxx) to reach the lead byte
   of the previous character, which relies on the preceding input
   being valid UTF-8.
*/
int ebs_past_separator (Lexicon lex, char *input)
{ char *cursor;
  if (lex -> separators -> size == 0)
    return (0);

  /* Single byte mode: look up the previous byte in the table */
  if (!lex -> utf8_processing)
    return (lex -> separator_chars[(unsigned char) input[-1]]);

  /* UTF-8 mode: back up to the lead byte of the previous character */
  cursor = input - 1;
  while ((*cursor & 0xC0) == 0x80)
    cursor--;
  return (ebs_is_separator (lex, &cursor));
}

/*
   Check whether the character at *input is one of the white space
   characters of the lexicon. If so, advance *input past it and
   return 1; otherwise return 0 and leave *input as it was.
   A lexicon without white space characters never matches.
*/
int ebs_is_white_space (Lexicon lex, char **input)
{ char *cursor = *input;
  int code, idx;
  if (lex -> white_spaces -> size == 0)
    return (0);

  if (lex -> utf8_processing)
    { /* Decode one character and look it up in the white space list */
      if (!ebs_is_utf8_char (&cursor, &code))
        return (0);
      idx = 0;
      while (idx < lex -> white_spaces -> size)
        { if (lex -> white_spaces -> array[idx] == code)
            { *input = cursor;
              return (1);
            }
          idx++;
        }
      return (0);
    }

  /* Single byte mode: consult the table indexed by byte value */
  if (lex -> white_space_chars[(unsigned char) *cursor] == 0)
    return (0);
  *input = cursor + 1;
  return (1);
}

/*
   Non consuming variant of ebs_is_white_space: report whether a
   white space character starts at input. Only the local copy of
   the pointer is advanced, so the caller's position is unaffected.
*/
int ebs_ahead_white_space (Lexicon lex, char *input)
{ return (ebs_is_white_space (lex, &input));
}

/*
   Check whether the character at *input has a translation in the
   lexicon. If so, advance *input past it, add the penalty of the
   translation to *rpenalty and return 1; otherwise return 0 and
   leave all arguments as they were.

   Which results are delivered depends on the mode:
   - single byte mode: the translated character is stored in *rch
     (rch is dereferenced unconditionally, so it must not be NULL)
     and *rstr, when rstr is given, is set to NULL;
     rvalue is not touched.
   - UTF-8 mode: the target code point is stored in *rvalue and its
     string form (from utf8_translation_targets) in *rstr, each only
     when the corresponding pointer is not NULL; rch is not touched.
*/
int ebs_is_translation (Lexicon lex, char **input,
			char *rch, int *rvalue, char **rstr, Penalty *rpenalty)
{ char *cursor = *input;
  int code, idx;

  if (lex -> utf8_processing)
    { /* Decode one character and search the translation sources */
      if (!ebs_is_utf8_char (&cursor, &code))
        return (0);
      for (idx = 0; idx < lex -> translation_sources -> size; idx++)
        { if (lex -> translation_sources -> array[idx] != code)
            continue;
          *input = cursor;
          if (rvalue != NULL)
            *rvalue = lex -> translation_targets -> array[idx];
          if (rstr != NULL)
            *rstr = lex -> utf8_translation_targets -> array[idx];
          *rpenalty += (Penalty) lex -> translation_penalties -> array[idx];
          return (1);
        }
      return (0);
    }
  else
    { /* A zero entry in the translation map means: no translation */
      unsigned char byte = (unsigned char) *cursor;
      char mapped = lex -> translation_map[byte];
      if (mapped == 0)
        return (0);
      *input = cursor + 1;
      *rch = mapped;
      if (rstr != NULL)
        *rstr = NULL;
      *rpenalty += lex -> translation_map_penalty[byte];
      return (1);
    }
}

/*
   Consume a run of one or more white space characters at *input.
   Returns 1 when at least one was consumed, leaving *input at the
   first character that is not white space; returns 0 otherwise.
*/
int ebs_is_white_spaces (Lexicon lex, char **input)
{ int found = 0;
  while (ebs_is_white_space (lex, input))
    found = 1;
  return (found);
}

/*
   Update the line and column counters for the characters in the
   range [from, to). A newline increments *line and resets *col
   to 1; any other character increments *col. In UTF-8 mode a
   multi byte sequence counts as one character.

   An undecodable UTF-8 sequence, or a last character that extends
   beyond to, is reported through dcg_internal_error.
*/
void ebs_update_position (Lexicon lex, char *from, char *to, int *line, int *col)
{ char *cursor;
  for (cursor = from; cursor < to; )
    { int code = 0;
      if (lex -> utf8_processing)
        { if (!ebs_is_utf8_char (&cursor, &code))
            dcg_internal_error ("ebs_update_position");
        }
      else code = (int)(unsigned int)(unsigned char) *cursor++;

      if (code != '\n')
        *col += 1;
      else
        { *line += 1;
          *col = 1;
        }
    }

  /* The range must end exactly on a character boundary */
  if (cursor > to)
    dcg_internal_error ("ebs_update_position");
}
