/*
   File: lexer.c
   Lexical analyzer for Lexicon, Fact, Triple and Lexicon Interface files

   Copyright 2009-2010 Radboud University of Nijmegen

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id$"
*/

/* system includes */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

/* libabase includes */
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_fileutil.h>

/* liblexicon includes */
#include <lxcn_input.h>

/* local includes */
#include "options.h"
#include "globals.h"
#include "lexgen_lexer.h"

/*
   Character reading administration
*/
/* Size bound handed to fgets when reading a line; the buffer holds one byte more */
#define MAX_LEX_LINE_LEN 1023
/* Storage for the line currently being scanned (filled by lexer_read_line) */
static char lex_line_buffer[MAX_LEX_LINE_LEN + 1];
/* Name of the file being read; built in try_open_lexer_file, used in messages */
static char *lex_fname;
/* Stream of the file being read; NULL after close_lexer_file */
static FILE *lex_file;
/* Kind of the file being read; selects the white space convention */
static file_kind lex_kind;
/* Current scan position in the line; NULL once fgets reports end of file */
static char *line_ptr;
/* Set to 1 by parse_error; checked by is_end and close_lexer_file */
static int has_errors;
/* Number of lexer_read_line calls since the file was opened */
static int linenr;

/*
   Report a parse error at the current file position.
   The message is prefixed with the file name and line number, formatted
   printf-style from 'format' and the variable arguments, and terminated
   with a newline. The error is remembered in has_errors.
*/
void parse_error (char *format, ...)
{ va_list args;

  /* Location prefix */
  abs_printf ("File '%s', line %d: ", lex_fname, linenr);

  /* The caller's message */
  va_start (args, format);
  abs_vprintf (format, args);
  va_end (args);

  abs_printf ("\n");

  /* Remember that this file is in error */
  has_errors = 1;
}

/*
   Report a warning at the current file position.
   Nothing is printed when 'verbose' is zero. Unlike parse_error,
   a warning does not mark the file as erroneous.
*/
void parse_warning (char *format,...)
{ va_list args;
  if (verbose)
    { abs_printf ("    File '%s', line %d: ", lex_fname, linenr);
      va_start (args, format);
      abs_vprintf (format, args);
      va_end (args);
      abs_printf ("\n");
    };
}

/* White space is a blank, newline, form feed, carriage return or tab */
static int ch_is_white_space (char ch)
{ switch (ch)
    { case ' ':
      case '\n':
      case '\f':
      case '\r':
      case '\t': return (1);
      default: break;
    };
  return (0);
}

/*
   Read the next line into the line buffer and eat all trailing white space.
   On end of file (or read error) line_ptr becomes NULL.
   The line counter is incremented on every call, including the one
   that hits end of file.
   Note: fgets is bounded by MAX_LEX_LINE_LEN, so a longer input line is
   delivered in several pieces, each counted as a line.
*/
void lexer_read_line ()
{ size_t len;
  line_ptr = fgets (lex_line_buffer, MAX_LEX_LINE_LEN, lex_file);
  linenr++;
  if (line_ptr == NULL) return;

  /* Trim by index: never form a pointer in front of the buffer */
  len = strlen (line_ptr);
  while ((len > 0) && ch_is_white_space (line_ptr[len - 1])) len--;
  line_ptr[len] = '\0';
}

/* Opening the lexer file: no version control yet */
#define DAT_SUFFIX "dat"
#define FCT_SUFFIX "fct"
#define TRP_SUFFIX "trp"
#define LIF_SUFFIX "lif"

/*
   Map a file kind to its file name suffix (without the dot).
   Lexicon and relators files share the "dat" suffix.
   An unknown kind is reported through abs_bug; if that returns, NULL is returned.
*/
char *suffix_from_file_kind (file_kind kind)
{ char *suffix = NULL;
  switch (kind)
    { case lexicon:
      case relators:
	suffix = DAT_SUFFIX;
	break;
      case fact:
	suffix = FCT_SUFFIX;
	break;
      case triple:
	suffix = TRP_SUFFIX;
	break;
      case lexicon_interface:
	suffix = LIF_SUFFIX;
	break;
      default:
	abs_bug ("suffix_from_file_kind", "called with strange file kind");
    };
  return (suffix);
}

/*
   Map a file kind to the human readable description used in messages.
   An unknown kind is reported through abs_bug; if that returns, NULL is returned.
*/
static char *string_from_file_kind (file_kind kind)
{ char *description = NULL;
  switch (kind)
    { case lexicon:
	description = "lexicon";
	break;
      case fact:
	description = "fact table";
	break;
      case triple:
	description = "triple collection";
	break;
      case lexicon_interface:
	description = "lexicon interface";
	break;
      case relators:
	description = "relators";
	break;
      default:
	abs_bug ("string_from_file_kind", "called with strange file kind");
    };
  return (description);
}

/*
   Try to open "<base_name>.<suffix of kind>" for reading and prepare the lexer.
   Returns 0 on success with the first line already read, and -1 when the
   file could not be opened. lex_fname and lex_kind are set in both cases.

   NOTE(review): the file name string allocated here is never released in
   this file; confirm whether callers own it.
*/
int try_open_lexer_file (char *base_name, file_kind kind)
{ /* Try open the file */
  lex_fname = abs_new_fmtd_string ("try_open_lexer_file", "%s.%s", base_name,
				   suffix_from_file_kind (kind));
  lex_file = abs_fopen (lex_fname, "r");
  lex_kind = kind;
  if (lex_file == NULL) return -1;

  /* A byte order mark can only be skipped while we are still at the
     start of the stream, so do it before the first line is read
     (presumably abs_file_skip_bom consumes a leading BOM; verify) */
  abs_file_skip_bom (lex_file);

  /* Prepare lexer by reading the first line */
  has_errors = 0;
  linenr = 0;
  lexer_read_line ();

  /* Tell we are reading something */
  if (verbose >= 0)
    { abs_message ("  reading %s \"%s\"",
		   string_from_file_kind (lex_kind), base_name);
    };
  return 0;
}

/*
   Finish reading the current file.
   When parse errors were recorded, abs_abort is called with a message
   naming the file. Otherwise the stream, if still open, is closed
   and forgotten.
*/
void close_lexer_file ()
{ if (has_errors)
    { abs_abort ("close_lexer_file", "%s '%s' contains errors",
		 string_from_file_kind (lex_kind), lex_fname);
    };
  if (lex_file == NULL) return;
  fclose (lex_file);
  lex_file = NULL;
}

/*
   Advance over layout at the current position.
   In lexicon interface files only blanks are skipped;
   in all other files every white space character is skipped.
*/
void may_skip_white_space ()
{ if (lex_kind != lexicon_interface)
    { while (ch_is_white_space (*line_ptr)) line_ptr++;
      return;
    };
  while (*line_ptr == ' ') line_ptr++;
}

/* True when no file is open or the last read attempt delivered no line */
int is_eof ()
{ if (lex_file == NULL) return (1);
  return (line_ptr == NULL);
}

/* True at end of file or as soon as a parse error has been recorded */
int is_end ()
{ if (is_eof ()) return (1);
  return (has_errors != 0);
}

/* Report a parse error unless the end of file has been reached */
void should_be_eof ()
{ if (!is_eof ())
    parse_error ("End of file expected");
}

/*
   True when the scan position is at the end of the current line.
   The line pointer is dereferenced unconditionally, so this must
   not be called once it is NULL.
*/
int is_eoln ()
{ return (line_ptr[0] == '\0');
}

/*
   Expect the end of the current line: report a parse error when something
   is left on it. In both cases continue with the next line.
*/
void should_be_eoln ()
{ int at_line_end = is_eoln ();
  if (!at_line_end)
    { parse_error ("End of line expected");
    };
  lexer_read_line ();
}

/* Move the scan position to the end of the current line (no new line is read) */
void skip_eoln ()
{ for (;;)
    { if (is_eof ()) break;
      if (is_eoln ()) break;
      line_ptr++;
    };
}

/*
   Recognize the single character 'ch' at the scan position.
   On a match the character and the layout following it are consumed
   and 1 is returned; otherwise nothing is consumed and 0 is returned.
*/
int is_char (char ch)
{ if (*line_ptr != ch) return (0);
  line_ptr++;
  may_skip_white_space ();
  return (1);
}

/*
   Expect the character 'ch'. When it is missing, report a parse error
   and skip the remainder of the current line.
*/
void should_be_char (char ch)
{ if (!is_char (ch))
    { parse_error ("Character '%c' expected", ch);
      skip_eoln ();
    };
}

/*
   Recognize a column separator (a single tab) at the scan position.
   The tab is consumed on a match; no layout is skipped after it.
*/
int is_lif_column_separator ()
{ if (*line_ptr != '\t') return (0);
  line_ptr++;
  return (1);
}

/* Expect a column separator; report a parse error when it is missing */
void should_be_lif_column_separator ()
{ if (!is_lif_column_separator ())
    parse_error ("Column separator expected");
}

#define LIF_SEPARATOR "*****"

/*
   Recognize a section separator: the whole remaining line must equal
   LIF_SEPARATOR. On a match the next line is read.
   Always fails at end of file or after a parse error.
*/
int is_lif_section_separator ()
{ if (is_end ()) return (0);
  if (strcmp (line_ptr, LIF_SEPARATOR) != 0) return (0);
  lexer_read_line ();
  return (1);
}

/* True when the character at the scan position is an ASCII letter; consumes nothing */
static int ahead_letter ()
{ char ch = *line_ptr;
  int is_lower = ((ch >= 'a') && (ch <= 'z'));
  int is_upper = ((ch >= 'A') && (ch <= 'Z'));
  return (is_lower || is_upper);
}

/* True when the character at the scan position is an ASCII capital; consumes nothing */
static int ahead_capital_letter ()
{ char ch = *line_ptr;
  return ((ch >= 'A') && (ch <= 'Z'));
}

                                   
/*
   True when the character at the scan position may occur in a name:
   an ASCII letter or digit, any byte with the high bit set, or one of
   the ornaments $ + - ? @ ^ ~ _. Consumes nothing.
*/
static int ahead_name_char ()
{ char ch = *line_ptr;
  if ((ch >= 'a') && (ch <= 'z')) return (1);
  if ((ch >= 'A') && (ch <= 'Z')) return (1);
  if ((ch >= '0') && (ch <= '9')) return (1);

  /* Accept all bytes above the 7 bit ASCII range */
  if ((((int) ch) & 0xff) >= 128) return (1);

  /* Ornaments */
  return ((ch == '$') || (ch == '+') || (ch == '-') || (ch == '?') ||
	  (ch == '@') || (ch == '^') || (ch == '~') || (ch == '_'));
}

/* True when an underscore is at the scan position; consumes nothing */
static int ahead_relation_ornament ()
{ return (*line_ptr == '_');
}

/*
   Basic lexical items of the files
*/
/*
   Recognize a comment: a '#' at the scan position.
   The rest of the line is discarded by reading the next line.
*/
int is_comment ()
{ if (*line_ptr != '#') return (0);
  lexer_read_line ();
  return (1);
}

/*
   Reading of numbers:
   For parameters we must be able to distinguish between a signed number
   and a - as the first character of an affix name.
*/
/*
   Look ahead for a negative number: a '-', optional white space and
   then a decimal digit. Consumes nothing.
*/
static int ahead_signed_number ()
{ char *look;
  if (*line_ptr != '-') return (0);
  look = line_ptr + 1;
  while (ch_is_white_space (*look)) look++;
  return ((*look >= '0') && (*look <= '9'));
}

/*
   Recognize one decimal digit at the scan position.
   On a match its value (0..9) is stored in *ret_val and the digit is
   consumed; otherwise *ret_val is left untouched.
*/
static int is_digit (int *ret_val)
{ char ch = *line_ptr;
  if ((ch < '0') || (ch > '9')) return (0);
  *ret_val = (int) (ch - '0');
  line_ptr++;
  return (1);
}

/*
   Recognize an unsigned decimal number and skip the layout following it.
   Returns 1 with the value in *ret_nr on a match, 0 (and *ret_nr untouched)
   when no digit is ahead.
   A number that does not fit in an int is reported as a parse error; all its
   digits are still consumed and INT_MAX is delivered, so that the
   accumulation never overflows.
*/
int is_unsigned_number (int *ret_nr)
{ int value, digit;
  int too_large = 0;
  if (!is_digit (&value)) return (0);
  while (is_digit (&digit))
    { if (too_large) continue;
      /* value * 10 + digit <= INT_MAX  <=>  value <= (INT_MAX - digit) / 10 */
      if (value > (INT_MAX - digit) / 10) too_large = 1;
      else value = value * 10 + digit;
    };
  if (too_large)
    { parse_error ("Number too large");
      value = INT_MAX;
    };
  may_skip_white_space ();
  *ret_nr = value;
  return (1);
}

/*
   Expect an unsigned number. When it is missing, report a parse error,
   skip the remainder of the line and deliver 0, so that the caller never
   sees an indeterminate value.
*/
void should_be_unsigned_number (int *ret_nr)
{ if (is_unsigned_number (ret_nr)) return;
  *ret_nr = 0;
  parse_error ("Number expected");
  skip_eoln ();
}

/*
   Recognize an optionally negative decimal number.
   A '-' only counts as a sign when a digit follows it (possibly after
   white space); otherwise it is left for the caller, since it may start
   an affix name.
   Returns 1 with the value in *ret_nr on a match, 0 otherwise.
*/
int is_signed_number (int *ret_nr)
{ /* Initialized: should_be_unsigned_number can fail after the look ahead
     succeeded, e.g. when a tab separates sign and digit in a lexicon
     interface file, where tabs are not skipped as layout */
  int value = 0;
  if (ahead_signed_number ())
    { should_be_char ('-');
      should_be_unsigned_number (&value);
      *ret_nr = -value;
      return (1);
    };
  return (is_unsigned_number (ret_nr));
}

/* Read an optional signed number; deliver 0 when there is none */
void may_be_signed_number (int *ret_nr)
{ if (!is_signed_number (ret_nr))
    *ret_nr = 0;
}

/*
   For the reading of names (file names or affix names), we accept letters, digits and ornaments. 
   When we are not reading lexicon interface files, we read trailing white space away
   (In lexicon interface files, the tab is used as a column separator).
*/
/*
   Recognize a name: a non empty run of name characters.
   The name is copied to dest_buffer and zero terminated; on failure
   dest_buffer holds the empty string. Layout after the name is skipped.
   No bound is checked on dest_buffer.
*/
int is_name (char *dest_buffer)
{ char *out = dest_buffer;
  for (;;)
    { if (is_eoln ()) break;
      if (!ahead_name_char ()) break;
      *out++ = *line_ptr++;
    };
  *out = '\0';
  if (out == dest_buffer) return (0);
  may_skip_white_space ();
  return (1);
}

/* Expect a name; report a parse error when it is missing */
void should_be_name (char *dest_buffer)
{ if (!is_name (dest_buffer))
    parse_error ("Name expected");
}

/*
   Reading of lexicon/fact nonterminal names
*/
/*
   Recognize a nonterminal name: it starts with a letter and consists of
   parts built from name characters. Layout between two parts is reduced
   to a single blank, provided the next part starts with a letter; trailing
   layout is skipped. What counts as layout depends on the file kind
   (only blanks in lexicon interface files).
   Returns 0 without touching dest_buffer when no letter is ahead.
   No bound is checked on dest_buffer.
*/
int is_nonterminal_name (char *dest_buffer)
{ char *out = dest_buffer;
  int in_lif = (lex_kind == lexicon_interface);
  if (!ahead_letter ()) return (0);
  for (;;)
    { int at_gap;
      if (is_eoln ()) break;
      if (!ahead_name_char ()) break;
      *out++ = *line_ptr++;

      /* Is the part followed by layout? */
      if (in_lif) at_gap = (*line_ptr == ' ');
      else at_gap = ch_is_white_space (*line_ptr);
      if (!at_gap) continue;

      /* Only continue the name when another part follows */
      may_skip_white_space ();
      if (is_eoln ()) break;
      if (!ahead_letter ()) break;
      *out++ = ' ';
    };
  *out = '\0';
  return (1);
}

/*
   Expect a nonterminal name. When it is missing, report a parse error and
   deliver the empty string (is_nonterminal_name leaves the buffer untouched
   on failure, so it would otherwise be indeterminate).
*/
void should_be_nonterminal_name (char *dest_buffer)
{ if (is_nonterminal_name (dest_buffer)) return;
  dest_buffer[0] = '\0';
  parse_error ("Nonterminal name expected");
}

/*
   Reading of relations
*/
/*
   Recognize a relation: a capital letter followed by letters and
   underscores. The relation is copied to dest_buffer and the layout
   after it is skipped.
   Returns 0 without touching dest_buffer when no capital is ahead.
   No bound is checked on dest_buffer.
*/
int is_relation (char *dest_buffer)
{ char *out = dest_buffer;
  if (!ahead_capital_letter ()) return (0);
  for (;;)
    { if (is_eoln ()) break;
      if (!ahead_letter () && !ahead_relation_ornament ()) break;
      *out++ = *line_ptr++;
    };
  *out = '\0';
  may_skip_white_space ();
  return (1);
}

/*
   Expect a relation. When it is missing, report a parse error and deliver
   the empty string (is_relation leaves the buffer untouched on failure,
   so it would otherwise be indeterminate).
*/
void should_be_relation (char *dest_buffer)
{ if (is_relation (dest_buffer)) return;
  dest_buffer[0] = '\0';
  parse_error ("Relation expected");
}

/*
   Reading of strings
*/
/*
   Recognize a double quoted string on the current line.
   The contents are copied to dest_buffer without the quotes. Escapes are
   NOT expanded here: a backslash and the character after it are both
   copied, which also keeps an escaped quote from ending the string.
   A string that is not closed before the end of the line is reported as a
   parse error, but still yields 1 with the text collected so far.
   Returns 0 without touching dest_buffer when no quote is ahead.
   No bound is checked on dest_buffer.
*/
int is_string (char *dest_buffer)
{ char *out = dest_buffer;
  if (*line_ptr != '"') return (0);
  line_ptr++;
  for (;;)
    { char ch = *line_ptr;
      if (ch == '\0')
	{ parse_error ("Unterminated string");
	  break;
	};
      if (ch == '"')
	{ line_ptr++;
	  break;
	};
      if (ch == '\\')
	{ line_ptr++;
	  /* A backslash at the end of the line: the next round reports it */
	  if (is_eoln ()) continue;
	  *out++ = '\\';
	};
      *out++ = *line_ptr++;
    };
  *out = '\0';
  may_skip_white_space ();
  return (1);
}

/*
   Read either a quoted string (with escapes expanded) or, when no quote is
   ahead, the raw text up to but not including 'terminator'.
   In the raw case trailing white space is removed from the result and a
   parse error is reported when the line ends before the terminator.
   Always returns 1. No bound is checked on dest_buffer.
*/
int should_be_string_up_to (char *dest_buffer, char terminator)
{ char *out = dest_buffer;
  if (is_string_with_expansion (dest_buffer)) return 1;

  /* Unquoted: copy until the terminator shows up */
  for (; *line_ptr != terminator; *out++ = *line_ptr++)
    { if (*line_ptr == '\0')
	{ parse_error ("Unterminated string");
	  break;
	};
    };

  /* Drop trailing layout */
  while ((out > dest_buffer) && ch_is_white_space (out[-1])) out--;
  *out = '\0';
  return (1);
}

/*
   Expect a quoted string. When it is missing, report a parse error and
   deliver the empty string (is_string leaves the buffer untouched on
   failure, so it would otherwise be indeterminate).
*/
void should_be_string (char *dest_buffer)
{ if (is_string (dest_buffer)) return;
  dest_buffer[0] = '\0';
  parse_error ("String expected");
}

/*
   Recognize a quoted string and expand its escapes into dest_buffer:
   \n, \t and \r become newline, tab and return; any other escaped
   character stands for itself.
   Returns 0 without touching dest_buffer when no string is ahead.
   No bound is checked on dest_buffer.
*/
int is_string_with_expansion (char *dest_buffer)
{ char raw[MAX_LEX_LINE_LEN + 1];
  char *in = raw;
  char *out = dest_buffer;
  char ch;
  if (!is_string (raw)) return (0);

  /* Copy the raw string to the destination while expanding escapes */
  while ((ch = *in++) != '\0')
    { if (ch != '\\')
	{ *out++ = ch;
	  continue;
	};

      /* Escape: look at the character after the backslash */
      ch = *in++;
      if (ch == 'n') *out++ = '\n';
      else if (ch == 't') *out++ = '\t';
      else if (ch == 'r') *out++ = '\r';
      /* To add: handling \u */
      /* To add: handling \x */
      else *out++ = ch;
    };
  *out = '\0';
  return (1);
}

/*
   Expect a quoted string and deliver it with its escapes expanded.
   When it is missing, report a parse error and deliver the empty string.
*/
void should_be_string_with_expansion (char *dest_buffer)
{ /* Must use the expanding recognizer: plain is_string would hand back
     the escapes unexpanded */
  if (is_string_with_expansion (dest_buffer)) return;
  dest_buffer[0] = '\0';
  parse_error ("String expected");
}

/*
   Reading of bitsets
*/
/*
   Classify 'ch' as a hexadecimal digit (0-9, A-F, a-f).
   Returns 1 and stores its value (0..15) in *val on success;
   returns 0 and leaves *val untouched otherwise.
*/
static int ch_is_hex_digit (char ch, int *val)
{ if ((ch >= '0') && (ch <= '9'))
    { *val = (int) (ch - '0');
      return (1);
    };
  if ((ch >= 'A') && (ch <= 'F'))
    { *val = 10 + (int) (ch - 'A');
      return (1);
    };
  if ((ch >= 'a') && (ch <= 'f'))
    { *val = 10 + (int) (ch - 'a');
      return (1);
    };
  return (0);
}

/*
   Recognize a bitset written as a run of hexadecimal digits and skip the
   layout after it. Returns 0 (with *ret_val untouched) when no hex digit
   is ahead. Digits shifted out at the top are silently lost.
*/
int is_bitset (Bitset64 *ret_val)
{ u_int64 bits;
  int nibble;
  if (!ch_is_hex_digit (*line_ptr, &nibble)) return (0);

  /* First digit seeds the value, every further digit adds four bits */
  bits = (u_int64) (unsigned) nibble;
  for (line_ptr++; ch_is_hex_digit (*line_ptr, &nibble); line_ptr++)
    bits = (bits << 4) | nibble;

  may_skip_white_space ();
  *ret_val = (Bitset64) bits;
  return (1);
}

/* Expect a bitset; when it is missing deliver 0 and report a parse error */
void should_be_bitset (Bitset64 *ret_val)
{ if (!is_bitset (ret_val))
    { *ret_val = 0;
      parse_error ("Bitset expected");
    };
}

/* Recognition of lhs/rhs */
/*
   Recognize the keyword "lhs" or "rhs" at the scan position.
   On a match *ret_val becomes 1 for lhs and 0 for rhs, the keyword and the
   layout after it are consumed. Only the first three characters are
   compared; what follows the keyword is not inspected.
*/
int is_lhs_or_rhs (int *ret_val)
{ int found_lhs = (strncmp (line_ptr, "lhs", 3) == 0);
  if (!found_lhs && (strncmp (line_ptr, "rhs", 3) != 0)) return (0);
  *ret_val = found_lhs;
  line_ptr += 3;
  may_skip_white_space ();
  return (1);
}

/* Expect "lhs" or "rhs"; when missing deliver 0 and report a parse error */
void should_be_lhs_or_rhs (int *ret_val)
{ if (!is_lhs_or_rhs (ret_val))
    { *ret_val = 0;
      parse_error ("lhs or rhs expected");
    };
}

/*
   Recognition of word forms
*/
/*
   Convert a unicode escape into its UTF8 encoding.
   *sptr points just after the "\u" or "\U"; 'nr' hex digits are expected
   there (4 or 8 as called from is_word_form). The encoded bytes are
   appended at *dptr. Both pointers are advanced past what was consumed
   and produced.

   A missing hex digit is reported as a parse error; the digits read so
   far are still encoded. A value above 0x1FFFFF cannot be represented in
   the four byte form used here: it is reported as a parse error and
   nothing is appended.
*/
static void add_utf8_character (char **sptr, char **dptr, int nr)
{ char *ptr = *sptr;
  char *tptr = *dptr;
  /* Unsigned and at least 32 bits wide: eight hex digits always fit,
     where a signed int accumulation could overflow */
  unsigned long value = 0;
  int ix, dig, hdr, nr_bytes;

  /* scan character */
  for (ix = 0; ix < nr; ix++)
    { if (ch_is_hex_digit (*ptr, &dig))
	{ value = value * 16 + (unsigned long) dig;
	  ptr++;
	}
      else
	{ parse_error ("Hex digit expected in unicode sequence");
	  break;
	};
    };
  *sptr = ptr;

  /* Refuse what the encoder below cannot represent */
  if (value > 0x1FFFFFUL)
    { parse_error ("Unicode character out of range in unicode sequence");
      return;
    };

  /* Pick the length and the header of the first byte */
  if      (value >= 0x10000UL) { hdr = 0xF0; nr_bytes = 4; }
  else if (value >= 0x800UL)   { hdr = 0xE0; nr_bytes = 3; }
  else if (value >= 0x80UL)    { hdr = 0xC0; nr_bytes = 2; }
  else { hdr = 0; nr_bytes = 1; };

  /* Emit the most significant bits first; every continuation byte
     carries six bits under a 10xxxxxx header */
  do
    { nr_bytes--;
      *tptr++ = (char) (hdr | (int) (value >> (nr_bytes * 6)));
      value &= ((1UL << (nr_bytes * 6)) - 1);
      hdr = 0x80;
    }
  while (nr_bytes > 0);
  *dptr = tptr;
}

/*
   Convert a "\xHH" escape into a single byte.
   *sptr points just after the "\x"; two hex digits are expected there.
   The byte is appended at *dptr and both pointers are advanced.
   A missing hex digit is reported as a parse error; the byte built from
   the digits read so far is still appended. The value 0xAD (soft hyphen)
   is reported as a parse error but appended as well.
*/
static void add_hex_character (char **sptr, char **dptr)
{ char *src = *sptr;
  int code = 0;
  int count, nibble;
  for (count = 0; count < 2; count++)
    { if (!ch_is_hex_digit (*src, &nibble))
	{ parse_error ("Hex digit expected in hexadecimal sequence");
	  break;
	};
      code = code * 16 + nibble;
      src++;
    };
  if (code == 0xAD)
    parse_error ("An extended ASCII soft hyphen character is not allowed in a word form");
  *sptr = src;
  *(*dptr)++ = (char) (unsigned int) code;
}

/*
   When the hyphen convention is active, a prefix, suffix and infix hyphen become special.
   Special characters in word forms are interpreted as follows:

   "-"  at start of wordform         -> Suffix
   "-"  at start and end of wordform -> Infix
   "-"  at end of wordform           -> Prefix
   "\-" at start/end of wordform     -> a literal hyphen
   "-"  in middle of wordform        -> a literal hyphen
   "@"  in middle of wordform        -> Soft Hyphen
   "!"  at start of wordform         -> Literal match
   "\!" at start of wordform         -> a literal exclamation mark
   "\@" -> an at sign
   "\n" -> a newline
   "\t" -> a tab
   "\r" -> a return
   "\\" -> a backslash
   "\"" -> a double quote
   "\xHH"       -> a character with HH as its hexadecimal representation
   "\uHHHH"     -> UTF8 encoding of unicode character HHHH
   "\UHHHHHHHH" -> UTF8 encoding of unicode character HHHHHHHH

   Special markers may be combined:
   "-!oxy-" denotes a literal infix "oxy"

   Note: all leading and trailing white space in word forms is removed
         while all infix white space is reduced to a single white space.

   UTF8 encoding:           | 1st byte | 2nd byte | 3rd byte | 4th byte |
   U00000000 - U0000007F -> | 0xxxxxxx |          |          |          |
   U00000080 - U000007FF -> | 110yyyyy | 10xxxxxx |          |          |
   U00000800 - U0000FFFF -> | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
   U00010000 - U001FFFFF -> | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
*/
/*
   Recognize a word form: a quoted string whose contents are interpreted.
   Leading and trailing blanks/tabs are stripped, inner runs of them become
   one blank (setting LexemeMultiWordBit), escapes are expanded and the
   hyphen, soft hyphen and literal markers are turned into marker bits or
   characters. The resulting lexeme is stored in dest_buffer and the
   collected marker bits in *ret_marker.

   Returns 0 without touching the outputs when no string is ahead, and 1
   otherwise. A lexeme that ends up empty is reported as a parse error.
   No bound is checked on dest_buffer.
*/
int is_word_form (char *dest_buffer, int *ret_marker)
{ char string_buf[MAX_LEX_LINE_LEN + 1];
  char *sptr = string_buf;
  char *dptr = dest_buffer;
  int marker = 0;
  int len;
  char ch;
  if (!is_string (string_buf)) return (0);
  len = (int) strlen (string_buf);

  /* Strip leading and trailing layout.
     The length guard keeps an empty or all blank string from being
     indexed in front of its first character. */
  while (ch = *sptr, ((ch == ' ') || (ch == '\t'))) { sptr++; len--; };
  while ((len > 0) && ((sptr[len - 1] == ' ') || (sptr[len - 1] == '\t')))
    sptr[--len] = '\0';

  /* Copy lexeme from source to lexeme_buf while expanding escapes */
  while ((ch = *sptr++))
    switch (ch)
      { case ' ':
	case '\t':
	  { /* Eat all other white space */
	    while ((ch = *sptr), (ch == ' ') || (ch == '\t')) sptr++;
	    *dptr++ = ' ';
	    marker |= LexemeMultiWordBit;
	  }; break;
	case '-':
	  { if (!hyphen_convention_active)
	      *dptr++ = '-';
	    else if ((dptr == dest_buffer) && !(marker & LexemeSuffixBit))
	      /* "-" is leading the lexeme, hence a suffix marker */
	      marker |= LexemeSuffixBit;
	    else if (!(*sptr))
	      /* "-" is trailing the lexeme, hence a prefix marker */
	      marker |= LexemePrefixBit;
	    else
	      /* In the middle of a lexeme, the - denotes a hard hyphen */
	      *dptr++ = '-';
	  }; break;
	case '@':
	  { if (!(*sptr) || (dptr == dest_buffer))
	      parse_error ("A soft hyphen may not occur at the start or end of a lexeme");
	    else *dptr++ = SoftHyphenChar;
	  }; break;
	case '!':
	  /* Only the first "!" at the start is the literal marker */
	  if (dptr == dest_buffer)
	    { if (marker & LexemeLiteralBit) *dptr++ = '!';
	      else marker |= LexemeLiteralBit;
	    }
	  else *dptr++ = ch;
	  break;
	case '\\':
	  { ch = *sptr++;
	    switch (ch)
	      { case '@': *dptr++ = '@'; break;
		case 'n': *dptr++ = '\n'; break;
		case 't': *dptr++ = '\t'; break;
		case 'r': *dptr++ = '\r'; break;
		case '\\': *dptr++ = '\\'; break;
		case '"': *dptr++ = '"'; break;
		case '!':
		  if (dptr != dest_buffer)
		    { parse_warning ("Literal escape '\\!' is only necessary at start of lexeme");
		      *dptr++ = ch;
		    }
		  else if (marker & LexemeLiteralBit) /* "!\!" is leading the lexeme */
		    { parse_warning ("Literal escape '\\!' is only necessary at start of lexeme");
		      *dptr++ = ch;
		    }
		  else *dptr++ = '!';
		  break;
		case '-': *dptr++ = '-'; break;
		case 'u': add_utf8_character (&sptr, &dptr, 4); break;
		case 'U': add_utf8_character (&sptr, &dptr, 8); break;
		case 'x': add_hex_character (&sptr, &dptr); break;
		default:
		  parse_warning ("Unknown escape sequence '\\%c'", ch);
		  *dptr++ = ch;
	      };
	  }; break;
	default:
	  *dptr++ = ch;
      };
  *dptr = '\0';

  /* Check for empty lexeme */
  if (!strlen (dest_buffer))
    parse_error ("Lexeme only consists of white space and literal/hyphen marks");

  /* Return with success */
  *ret_marker = marker;
  return (1);
}

/*
   Expect a word form. When it is missing, report a parse error and deliver
   an empty lexeme without markers (is_word_form leaves both outputs
   untouched on failure, so they would otherwise be indeterminate).
*/
void should_be_word_form (char *dest_buffer, int *ret_marker)
{ if (is_word_form (dest_buffer, ret_marker)) return;
  dest_buffer[0] = '\0';
  *ret_marker = 0;
  parse_error ("word form expected");
}

/*
   Recognize a placeholder terminal: text starting with '$' and running up
   to and including the next '$' on the line (or to the end of the line
   when there is no closing '$'). The text, including the dollar signs, is
   copied to dest_buffer, the layout after it is skipped and *ret_marker is
   set to TERMINAL_PLACEHOLDER.
   Returns 0 without touching the outputs when no '$' is ahead.
   No bound is checked on dest_buffer.
*/
int is_placeholder_terminal (char *dest_buffer, int *ret_marker)
{ char *out = dest_buffer;
  if (*line_ptr != '$') return 0;

  /* Opening '$' and everything up to the closing one */
  do *out++ = *line_ptr++;
  while ((*line_ptr != '\0') && (*line_ptr != '$'));

  /* Closing '$', when present */
  if (*line_ptr != '\0') *out++ = *line_ptr++;
  *out = '\0';

  may_skip_white_space ();
  *ret_marker = TERMINAL_PLACEHOLDER;
  return 1;
}
