/*
   File: lexer.c
   Lexical analyzer for lexicon, fact and triple files

   Copyright 2012 Radboud University of Nijmegen

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id: lexer.c,v 1.7 2013/03/13 10:07:34 marcs Exp $"
*/

/* system includes */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>

/* libdcg includes */
#include <dcg_error.h>
#include <dcg_alloc.h>
#include <dcg_string.h>
#include <dcg_fileutil.h>

/* libeagbase includes */
#include <ebase_input.h>

/* local includes */
#include "options.h"
#include "globals.h"
#include "lexer.h"

/*
   Character reading administration
*/
#define MAX_LEX_LINE_LEN 1023
static char lex_line_buffer[MAX_LEX_LINE_LEN + 1];
static char *lex_fname;
static FILE *lex_file;
static file_kind lex_kind;
static char *line_ptr;
static int has_errors;
static int linenr;

void parse_error (char *format, ...)
{ char buf[MAXSTRLEN];
  va_list argp;
  va_start (argp, format);
  vsprintf (buf, format, argp);
  va_end (argp);
  has_errors = 1;
  dcg_error (0, "File '%s', line %d: %s", lex_fname, linenr, buf);
}

void parse_warning (char *format,...)
{ char buf[MAXSTRLEN];
  va_list argp;
  if (!verbose) return;
  va_start (argp, format);
  vsprintf (buf, format, argp);
  va_end (argp);
  dcg_warning (0, "File '%s', line %d: %s", lex_fname, linenr, buf);
}

/* White space is blank, tab, newline and more? */
static int ch_is_white_space (char ch)
{ return ((ch == ' ') || (ch == '\n') || (ch == '\f') || (ch == '\r') || (ch == '\t'));
}

/* Read line and eat all trailing white space */
void lexer_read_line ()
{ char *line_end;
  int len;
  line_ptr = fgets (lex_line_buffer, MAX_LEX_LINE_LEN, lex_file);
  linenr++;
  if (line_ptr == NULL) return;
  len = (int) strlen (line_ptr);
  line_end = line_ptr + len - 1;
  while ((len != 0) && ch_is_white_space (*line_end))
    { line_end--; len--; }
  *++line_end = '\0';
}

/* Opening the lexer file: no version control yet */
#define DAT_SUFFIX "dat"
#define FCT_SUFFIX "fct"
#define TRP_SUFFIX "trp"
char *suffix_from_file_kind (file_kind kind)
{ switch (kind)
    { case lexicon:		return (DAT_SUFFIX);
      case fact:		return (FCT_SUFFIX);
      case triple:		return (TRP_SUFFIX);
      case translations:	return (NULL);
      default: dcg_bug ("suffix_from_file_kind", "called with strange file kind");
    };
  return (NULL);
}

static char *string_from_file_kind (file_kind kind)
{ switch (kind)
    { case lexicon:		return ("lexicon");
      case fact:		return ("fact table");
      case triple:		return ("triple collection");
      case translations:	return ("translations");
      default: dcg_bug ("string_from_file_kind", "called with strange file kind");
    };
  return (NULL);
}

int try_open_lexer_file (char *name, file_kind kind)
{ /* Try open the file */
  char *suffix = suffix_from_file_kind (kind);
  if (suffix == NULL) lex_fname = name;
  else lex_fname = dcg_new_fmtd_string ("%s.%s", name, suffix_from_file_kind (kind));
  lex_file = dcg_try_fopen_path (dir_name, lex_fname, "r");
  lex_kind = kind;
  if (lex_file == NULL) return -1;

  /* Prepare lexer by reading the first line */
  has_errors = 0;
  linenr = 0;
  lexer_read_line ();

  /* Tell we are reading something */
  dcg_warning (0, "   parsing %s \"%s\"...", string_from_file_kind (lex_kind), name);
  return 0;
}

void close_lexer_file ()
{ if (has_errors)
    dcg_abort ("close_lexer_file", "%s '%s' contains errors",
	       string_from_file_kind (lex_kind), lex_fname);
  if (lex_file != NULL) fclose (lex_file);
  lex_file = NULL;
}

void may_skip_white_space ()
{ while (ch_is_white_space (*line_ptr)) line_ptr++;
}

int is_eof ()
{ return (lex_file == NULL || line_ptr == NULL);
}

int is_end ()
{ return (is_eof () || has_errors);
}

void should_be_eof ()
{ if (is_eof ()) return;
  parse_error ("End of file expected");
}

int is_eoln ()
{ return (*line_ptr == '\0');
}

void should_be_eoln ()
{ if (!is_eoln ())
    parse_error ("End of line expected");
  lexer_read_line ();
}

void skip_eoln ()
{ while (!is_eof () && !is_eoln ()) line_ptr++;
}

int is_char (char ch)
{ if (ch == *line_ptr)
    { line_ptr++;
      may_skip_white_space ();
      return (1);
    };
  return (0);
}

void should_be_char (char ch)
{ if (is_char (ch)) return;
  parse_error ("Character '%c' expected", ch);
  skip_eoln ();
}

static int ahead_small_letter ()
{ if (('a' <= (*line_ptr)) && (*line_ptr <= 'z')) return (1);
  return (0);
}

static int ahead_capital_letter ()
{ if (('A' <= (*line_ptr)) && (*line_ptr <= 'Z')) return (1);
  return (0);
}

static int ahead_letter ()
{ if (ahead_small_letter ()) return (1);
  if (ahead_capital_letter ()) return (1);
  return (0);
}

static int ahead_digit ()
{ if (('0' <= (*line_ptr)) && (*line_ptr <= '9')) return (1);
  return (0);
}

static int ahead_extended_letter ()
{ if ((((int) (*line_ptr)) & 0xff) >= 128) return (1);  /* Accept all extended ASCII */
  return (0);
}

static int ahead_ornament ()
{ switch (*line_ptr)
    { case '+':
      case '-':
      case '?':
      case '@':
      case '_': return (1);
      default: break;
    };
  return (0);
}

static int ahead_letment ()
{ if (ahead_letter ()) return (1);
  if (ahead_digit ()) return (1);
  if (ahead_extended_letter ()) return (1);
  if (ahead_ornament ()) return (1);
  return (0);
}

static int ahead_affix_terminal_first_letment ()
{ if (ahead_small_letter ()) return (1);
  if ((*line_ptr != '-') && (*line_ptr != '+')) return (0);
  return (('a' <= line_ptr[1]) && (line_ptr[1] <= 'z'));
}

static int ahead_affix_terminal_letment ()
{ if (ahead_small_letter ()) return (1);
  if (ahead_digit ()) return (1);
  if (ahead_extended_letter ()) return (1);
  if (ahead_ornament ()) return (1);
  return (0);
}

static int ahead_affix_nonterminal_letment ()
{ if (ahead_capital_letter ()) return (1);
  if (ahead_extended_letter ()) return (1);
  switch (*line_ptr)
    { case '?':
      case '@':
      case '_': return (1);
      default: break;
    };
  return (0);
}

/*
   Basic lexical items of the files
*/
int is_comment ()
{ if (*line_ptr == '#')
    { lexer_read_line ();
      return (1);
    };
  return (0);
}

/*
   Reading of numbers:
   For parameters we must be able to distinguish between a signed number
   and a +/- as the first character of an affix name.
*/
static int ch_is_digit (char ch, int *val)
{ if (('0' <= ch) && (ch <= '9'))
    *val = (int) (ch - '0');
  else return (0);
  return (1);
}

static int ch_is_octal_digit (char ch, int *val)
{ if (('0' <= ch) && (ch <= '7'))
    *val = (int) (ch - '0');
  else return (0);
  return (1);
}

static int ch_is_hex_digit (char ch, int *val)
{ if (('0' <= ch) && (ch <= '9'))
    *val = (int) (ch - '0');
  else if (('A' <= ch) && (ch <= 'F'))
    *val = (int) (ch - 'A') + 10;
  else if (('a' <= ch) && (ch <= 'f'))
    *val = (int) (ch - 'a') + 10;
  else return (0);
  return (1);
}

/* We need this lookahead to determine between affixes and signed numbers */
static int ahead_signed_number ()
{ char *ptr;
  int val;
  if ((*line_ptr != '-') && (*line_ptr != '+')) return (0);
  for (ptr = line_ptr + 1; ch_is_white_space (*ptr); ptr++) ;
  return (ch_is_digit (*ptr, &val));
}

/*
   In the lookahead for real numbers we do not check for the exponent.
   This is alright because we require that every real number contains a period.
*/
static int ahead_real_number ()
{ char *ptr = line_ptr;
  int val;
  if ((*ptr == '+') || (*ptr == '-'))
    for (ptr++; ch_is_white_space (*ptr); ptr++) ;
  if (!ch_is_digit (*ptr, &val)) return (0);
  for (ptr++; ch_is_digit (*ptr, &val); ptr++)
  if (*ptr != '.') return (0);
  return (ch_is_digit (*++ptr, &val));
}

static int is_digit (int *ret_val)
{ if (ch_is_digit (*line_ptr, ret_val))
    { line_ptr++;
      return (1);
    };
  return (0);
}

static int is_hex_digit (int *ret_val)
{ if (ch_is_hex_digit (*line_ptr, ret_val))
    { line_ptr++;
      return (1);
    };
  return (0);
}

int is_unsigned_number (int *ret_nr)
{ int value, digit;
  if (!is_digit (&value)) return (0);
  if ((value == 0) && (*line_ptr == 'x'))
    { /* Hexadecimal input */
      line_ptr++;
      if (!is_hex_digit (&value))
	{ parse_error ("Hexadecimal number expected");
	  *ret_nr = 0;
	  return (1);
	};
      while (is_hex_digit (&digit))
	value = value * 16 + digit;
    }
  else
    { /* Decimal input */
      while (is_digit (&digit))
	value = value * 10 + digit;
    };
  may_skip_white_space ();
  *ret_nr = value;
  return (1);
}

void should_be_unsigned_number (int *ret_nr)
{ if (is_unsigned_number (ret_nr)) return;
  parse_error ("Unsigned number expected");
  skip_eoln ();
};

int is_signed_number (int *ret_nr)
{ int minus = 0;
  int value;
  if (ahead_signed_number ())
    { if (is_char ('-')) minus = 1;
      else should_be_char ('+');
      should_be_unsigned_number (&value);
      if (minus) *ret_nr = -value;
      else *ret_nr = value;
      return (1);
    };
  return (is_unsigned_number (ret_nr));
}

void may_be_signed_number (int *ret_nr)
{ if (is_signed_number (ret_nr)) return;
  *ret_nr = 0;
}

int is_real_number (real *ret_nr)
{ real value = 0.0;
  int cdigits = 0;
  int minus = 0;
  int digit;
  if (!ahead_real_number ())
    return (0);

  /* Eat optional sign */
  if (is_char ('+')) ;
  else if (is_char ('-')) minus = 1;

  while (is_digit (&digit))
    value = value * 10.0 + digit;

  if (!is_char ('.'))
    dcg_internal_error ("is_real_number");
  
  while (is_digit (&digit))
    { value = value * 10.0 + digit;
      cdigits++;
    };

  value = value / pow (10.0, (real) cdigits);
  if (is_char ('E') || is_char ('e'))
    { int e_minus = 0;
      real exp_value;
      int exponent;
      if (is_char ('+')) ;
      else if (is_char ('-')) e_minus = 1;
      if (!is_digit (&exponent))
	parse_error ("exponent expected");
      while (is_digit (&digit))
	exponent = exponent * 10 + digit;
      exp_value = pow (10.0, (real) exponent);
      if (e_minus) value = value / exp_value;
      else value = value * exp_value;
    };

  if (minus) value = -value;
  *ret_nr = value;
  return (1);
}

/*
   Lexing of affix terminals
*/
int is_affix_terminal (char **rname)
{ char buf[MAX_LEX_LINE_LEN];
  char *dptr = buf;
  if (is_eoln ()) return (0);
  if (!ahead_affix_terminal_first_letment ()) return (0);
  while (!is_eoln () && ahead_affix_terminal_letment ())
    *dptr++ = *line_ptr++;
  *dptr = '\0';
  *rname = new_string (buf);
  may_skip_white_space ();
  return (1);
}

/*
   Lexing of affix nonterminals
*/
int is_affix_nonterminal (char **rname)
{ char buf[MAX_LEX_LINE_LEN];
  char *dptr = buf;
  if (is_eoln ()) return (0);
  if (!ahead_capital_letter ()) return (0);
  while (!is_eoln () && ahead_affix_nonterminal_letment ())
    *dptr++ = *line_ptr++;
  *dptr = '\0';
  *rname = new_string (buf);
  may_skip_white_space ();
  return (1);
}

/*
   For the reading of nonterminal name parts, we accept a letter, followed by letments
*/
int is_name_part (char **rname)
{ char buf[MAX_LEX_LINE_LEN];
  char *dptr = buf;
  if (is_eoln ()) return (0);
  if (!ahead_letter ()) return (0);
  while (!is_eoln () && ahead_letment ())
    *dptr++ = *line_ptr++;
  *dptr = '\0';
  *rname = new_string (buf);
  may_skip_white_space ();
  return (1);
}

void should_be_name_part (char **rname)
{ if (is_name_part (rname)) return;
  parse_error ("Name expected");
  *rname = new_string ("<ERROR>");
}

/*
   Reading of relations
*/
int is_relation (char **rname)
{ char buf[MAX_LEX_LINE_LEN];
  char *dptr = buf;
  if (!ahead_capital_letter ()) return (0);
  while (!is_eoln () && ahead_letter ())
    *dptr++ = *line_ptr++;
  *dptr = '\0';
  *rname = new_string (buf);
  may_skip_white_space ();
  return (1);
}

void should_be_relation (char **rname)
{ if (is_relation (rname)) return;
  parse_error ("Relation expected");
}

/*
   Reading of strings
*/
int is_string (char **str)
{ char buf[MAX_LEX_LINE_LEN];
  char *dptr = buf;
  int done = 0;
  if (*line_ptr != '"') return (0);
  line_ptr++;
  while (!done)
    switch (*line_ptr)
      { case '\0': parse_error ("Unterminated string"); done = 1; break;
	case '"':  line_ptr++; done = 1; break;
	case '\\':
	  { line_ptr++;
	    if (is_eoln ()) break;
	    *dptr++ = '\\';
	    *dptr++ = *line_ptr++;
	  }; break;
	default:
	  *dptr++ = *line_ptr++;
      };
  *dptr = '\0';
  *str = new_string (buf);
  may_skip_white_space ();
  return (1);
}

void should_be_string (char **str)
{ if (is_string (str)) return;
  parse_error ("String expected");
}

/*
   Reading of strings with expanding of escapes
*/
static void yield_encoded_char (char ch, char **dptr, int *ret_nr)
{ if (dptr != NULL)
    { **dptr = ch;
      *dptr = (*dptr + 1);
    }
  else if (ret_nr != NULL)
    *ret_nr = (int)(unsigned int)(unsigned char) ch;
  else dcg_internal_error ("yield_encoded_char");
}

static void add_utf8_character (char **sptr, char **dptr, int *ret_nr, int nr)
{ int ix, dig, hdr, nr_bytes;
  int value = 0;
  char *ptr = *sptr;
  char *tptr = NULL;

  /* scan character */
  for (ix = 0; ix < nr; ix++)
    { if (ch_is_hex_digit (*ptr, &dig))
	{ value = value * 16 + dig;
	  ptr++;
	}
      else
	{ parse_error ("Hex digit expected in unicode sequence");
	  break;
	};
    };
  *sptr = ptr;

  /* Either directly output or convert */
  if (dptr != NULL)
    tptr = *dptr;
  else if (ret_nr != NULL)
    { *ret_nr = value;
      return;
    }
  else dcg_internal_error ("add_utf8_character");

  /* Encode character */
  if    (value & 0x1F0000) { hdr = 0xF0; nr_bytes = 4; }
  else if (value & 0xF800) { hdr = 0xE0; nr_bytes = 3; }
  else if (value & 0x0780) { hdr = 0xC0; nr_bytes = 2; }
  else { hdr = 0; nr_bytes = 1; };

  do
    { nr_bytes--;
      *tptr++ = (char) (hdr | (value >> (nr_bytes * 6)));
      value &= ((1 << (nr_bytes * 6)) - 1);
      hdr = 0x80;
    }
  while (nr_bytes > 0);
  *dptr = tptr;
}

static void add_octal_character (char **sptr, char **dptr, int *ret_nr, int wform)
{ char *ptr = *sptr;
  int value = 0;
  int ix, dig;
  for (ix = 0; ix < 3; ix++)
    { if (ch_is_octal_digit (*ptr, &dig))
	{ value = 8 * value + dig;
	  ptr++;
	}
      else
	{ parse_error ("Octal digit expected in octal sequence");
	  break;
	};
    };
  if (wform & (value == 0xAD))
    parse_error ("An extended ASCII soft hyphen character is not allowed in a word form");
  *sptr = ptr;
  yield_encoded_char ((char) (unsigned int) value, dptr, ret_nr);
}

static void add_hex_character (char **sptr, char **dptr, int *ret_nr, int wform)
{ char *ptr = *sptr;
  int value = 0;
  int ix, dig;
  for (ix = 0; ix < 2; ix++)
    { if (ch_is_hex_digit (*ptr, &dig))
	{ value = 16 * value + dig;
	  ptr++;
	}
      else
	{ parse_error ("Hex digit expected in hexadecimal sequence");
	  break;
	};
    };
  if (wform && (value == 0xAD))
    parse_error ("An extended ASCII soft hyphen character is not allowed in a word form");
  *sptr = ptr;
  yield_encoded_char ((char) (unsigned int) value, dptr, ret_nr);
}

static int is_encoded_char (char **sptr, char **dptr, int *ret_nr)
{ char ch = **sptr;
  if (!ch) return (0);
  *sptr = (*sptr) + 1;
  switch (ch)
    { case '\\':
	{ ch = **sptr;
	  *sptr = (*sptr) + 1;
	  switch (ch)
	    { case '\'': yield_encoded_char ('\'', dptr, ret_nr); break;
	      case 'a':  yield_encoded_char ('\a', dptr, ret_nr); break;
	      case 'b':  yield_encoded_char ('\b', dptr, ret_nr); break;
	      case 'f':  yield_encoded_char ('\f', dptr, ret_nr); break;
	      case 'n':  yield_encoded_char ('\n', dptr, ret_nr); break;
	      case 't':  yield_encoded_char ('\t', dptr, ret_nr); break;
	      case 'r':  yield_encoded_char ('\r', dptr, ret_nr); break;
	      case 'u': add_utf8_character (sptr, dptr, ret_nr, 4); break;
	      case 'U': add_utf8_character (sptr, dptr, ret_nr, 8); break;
	      case 'x': add_hex_character (sptr, dptr, ret_nr, 0); break;
	      case '0':
	      case '1':
	      case '2':
	      case '3': add_octal_character (sptr, dptr, ret_nr, 0); break;
	      default:
		parse_warning ("Unknown escape sequence '\\%c'", ch);
		yield_encoded_char (ch, dptr, ret_nr);
	    };
	}; break;
      default:
	yield_encoded_char (ch, dptr, ret_nr);
    };
  return (1);
}

/*
   Reading of quoted character in translations
*/
int is_quoted_character (int *ret_nr)
{ if (*line_ptr != '\'') return (0);
  line_ptr++;
  if (!is_encoded_char (&line_ptr, NULL, ret_nr))
    parse_error ("character expected after quote");
  else if (*line_ptr != '\'')
    parse_error ("quote expected");
  else line_ptr++;
  may_skip_white_space ();
  return (1);
}

/*
   Reading of strings with expanding of escapes
*/
int is_string_with_expansion (char **str)
{ char buf[MAX_LEX_LINE_LEN + 1];
  char *dptr = buf;
  char *source;
  char *sptr;
  if (!is_string (&source)) return (0);

  /* Copy string from source to buffer while expanding escapes */
  sptr = source;
  while (is_encoded_char (&sptr, &dptr, NULL)) ;
  *dptr = '\0';
  *str = new_string (buf);
  detach_string (&source);
  return (1);
}

void should_be_string_with_expansion (char **str)
{ if (is_string (str)) return;
  parse_error ("String expected");
}

/*
   Recognition of word forms
*/
/*
   When the hyphen convention is active, a prefix, suffix and infix hyphen become special.
   Special characters in word forms are interpreted as follows:

   "-"  at start of wordform         -> Suffix
   "-"  at start and end of wordform -> Infix
   "-"  at end of wordform           -> Prefix
   "\-" at start/end of wordform     -> a literal hyphen
   "-"  in middle of wordform        -> a literal hyphen
   "@"  in middle of wordform        -> Soft Hyphen
   "!"  at start of wordform         -> Literal match
   "\!" at start of wordform         -> a literal exclamation mark
   "\@" -> an at sign
   "\a" -> a BEL
   "\b" -> a BS
   "\f" -> a FF
   "\n" -> a newline
   "\t" -> a tab
   "\r" -> a return
   "\\" -> a backslash
   "\"" -> a double quote
   "\ooo"	-> a character with ooo as its octal representation
   "\xHH"       -> a character with HH as its hexadecimal representation
   "\uHHHH"     -> UTF8 encoding of unicode character HHHH
   "\UHHHHHHHH" -> UTF8 encoding of unicode character HHHHHHHH

   Special markers may be combined:
   "-!oxy-" denotes a literal infix "oxy"

   Note: all leading and trailing white space in word forms is removed
	 while all infix white space is reduced to a single white space.

   UTF8 encoding:           | 1st byte | 2nd byte | 3rd byte | 4th byte |
   U00000000 - U0000007F -> | 0xxxxxxx |          |          |          |
   U00000080 - U000007FF -> | 110yyyyy | 10xxxxxx |          |          |
   U00000800 - U0000FFFF -> | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
   U00010000 - U001FFFFF -> | 11110uuu | 10uuzzzz | 10yyyyyy | 10zzzzzz |
*/
int is_word_form (char **wform, int *ret_marker, int empty_ok)
{ char buf[MAX_LEX_LINE_LEN + 1];
  char *dptr = buf;
  int marker = 0;
  char *source;
  char *sptr;
  int len;
  char ch;

  if (!is_string (&source)) return (0);
  sptr = source;
  len = (int) strlen (source);
  
  /* Strip leading and trailing layout */
  while (ch = *sptr, ((ch == ' ') || (ch == '\t'))) { sptr++; len--; };
  while (ch = sptr[len - 1], ((ch == ' ') || (ch == '\t'))) sptr[--len] = '\0';

  /* Copy lexeme from source to buf while expanding escapes */
  while ((ch = *sptr++))
    switch (ch)
      { case ' ':
	case '\t':
	  { /* Eat all other white space */
	    while ((ch = *sptr), (ch == ' ') || (ch == '\t')) sptr++;
	    *dptr++ = ' ';
	    marker |= LexemeMultiWordBit;
	  }; break;
	case '-':
	  { if (!hyphen_convention_active)
	      *dptr++ = '-';
	    else if ((dptr == buf) && !(marker & LexemeSuffixBit))
	      /* "-" is leading the lexeme, hence a suffix marker */
	      marker |= LexemeSuffixBit;
	    else if (!(*sptr))
	      /* "-" is trailing the lexeme, hence a prefix marker */
	      marker |= LexemePrefixBit;
	    else
	      /* In the middle of a lexeme, the - denotes a hard hyphen */
	      *dptr++ = '-';
	  }; break;
	case '@':
	  { if (!(*sptr) || (dptr == buf))
	      parse_error ("A soft hyphen may not occur at the start or end of a lexeme");
	    else *dptr++ = SoftHyphenChar;
	  }; break;
	case '!':
	  if (dptr == buf)
	    { if (marker & LexemeLiteralBit) *dptr++ = '!';
	      else marker |= LexemeLiteralBit;
	    }
	  else *dptr++ = ch;
	  break;
	case '\\':
	  { ch = *sptr++;
	    switch (ch)
	      { case '@': *dptr++ = '@'; break;
		case 'a': *dptr++ = '\a'; break;
		case 'b': *dptr++ = '\b'; break;
		case 'f': *dptr++ = '\f'; break;
		case 'n': *dptr++ = '\n'; break;
		case 't': *dptr++ = '\t'; break;
		case 'r': *dptr++ = '\r'; break;
		case '\\': *dptr++ = '\\'; break;
		case '"': *dptr++ = '"'; break;
		case '!':
		  if (dptr != buf)
		    { parse_warning ("Literal escape '\\!' is only necessary at start of lexeme");
		      *dptr++ = ch;
		    }
		  else if (marker & LexemeLiteralBit) /* "!\!" is leading the lexeme */
		    { parse_warning ("Literal escape '\\!' is only necessary at start of lexeme");
		      *dptr++ = ch;
		    }
		  else *dptr++ = '!';
		  break;
		case '-': *dptr++ = '-'; break;
		case 'u': add_utf8_character (&sptr, &dptr, NULL, 4); break;
		case 'U': add_utf8_character (&sptr, &dptr, NULL, 8); break;
		case 'x': add_hex_character (&sptr, &dptr, NULL, 1); break;
		case '0':
		case '1':
		case '2':
		case '3': add_octal_character (&sptr, &dptr, NULL, 1); break;
		default:
		  parse_warning ("Unknown escape sequence '\\%c'", ch);
		  *dptr++ = ch;
	      };
	  }; break;
	default:
	  *dptr++ = ch;
      };
  *dptr = '\0';

  /* Check for empty lexeme */
  if (!strlen (buf) && !empty_ok)
    { parse_error ("Lexeme only consists of white space and literal/hyphen marks");
      detach_string (&source);
      return (0);
    }

  /* Return with success */
  *wform = new_string (buf);
  *ret_marker = marker;
  detach_string (&source);
  return (1);
}

