/* String utility functions for lexicon library.
 *
 * Copyright 2000 KUN.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/* $Id: str_util.c,v 1.7 2002/10/08 13:42:02 peterb Exp $ */

#include <string.h>
#include <ctype.h>
#include <stdio.h>

// Control codes that lexeme_copy() puts in front of a copied word form to
// record its type (prefix, suffix, infix or multi-token).
const char PrefixMark           = '\1';
const char SuffixMark           = '\2';
const char InfixMark            = '\3';
const char MultiTokenMark       = '\4';

// Strip leading and trailing spaces/tabs from the first *len bytes of s.
// Trailing layout is overwritten with NULs, *len is updated, and the new
// start of the string is returned. Never reads or writes before s.
static char *trim_layout(char* s, int* len)
{
  while ((*s == ' ') || (*s == '\t'))
    { s++; (*len)--; }
  while ((*len > 0) && ((s[*len - 1] == ' ') || (s[*len - 1] == '\t')))
    s[--(*len)] = '\0';
  return s;
}

//------------------------------------------------------------------------------
// Function:
//	char* lexeme_copy(char* dst, char* src, int len)
//
// Description:
//	Copy quoted string src with length len (quotes included) to dst.
//	Strip the quotes, and strip layout around the word form.
//	A leading hyphen marks a suffix, a trailing unescaped hyphen marks a
//	prefix, both together mark an infix; these hyphens are stripped and
//	replaced by a control code (SuffixMark, PrefixMark, InfixMark) in
//	front of the word form.
//	A word form containing layout is a multi token: it is preceded by
//	MultiTokenMark and keeps its leading/trailing hyphen.
//	Reduce multiple spaces or tabs to one space.
//	Replace \- with -, \\ with \, \n with newline, \t with tab, \" with ";
//	any other escaped character is copied as is.
//	If nothing but hyphens and layout is left, dst becomes "--", "-"
//	or " " (both hyphens, one hyphen, no hyphen).
//
// Return value:
//	Pointer dst.
//
// Side Effects:
//	Contents of dst are overwritten.
//	Contents of src are modified: the closing quote, a stripped prefix
//	hyphen and trailing layout are overwritten with NUL characters.
//
// Memory management:
//	None.
//
// Note:
//	String src should not be preceded or followed by layout.
//	String dst should be at least len + 1 bytes long.
//	If len < 2 (not even two quotes), dst becomes the empty string and
//	src is left untouched.
//	A backslash directly before the end of the word form is dropped.
//	A trailing hyphen directly preceded by a backslash is always treated
//	as escaped, even if that backslash is itself escaped (\\-).
//
//	Adaptions to lexeme_copy() should also be applied to other components
//	of lexicon system (lexicon, agfl, and rts).
//------------------------------------------------------------------------------

char *lexeme_copy(char* dst, char* src, int len)
{
  int	layout;
  int	prefix = 0;
  int	suffix = 0;
  int	multi_token;
  char	c;
  char* d = dst;

	// a quoted string consists of at least its two quotes
  if (len < 2)
  {
    *dst = '\0';
    return dst;
  }

	// strip quotes
  src[len - 1] = '\0';
  src += 1;
  len -= 2;

	// strip leading and trailing layout
  src = trim_layout(src, &len);

	// strip prefix and suffix marks
  if (*src == '-')
  {
    suffix = 1;
    src++; len--;
  }
  if ((len > 0) && (src[len - 1] == '-')
      && !((len > 1) && (src[len - 2] == '\\')))
  {
    prefix = 1;
    src[--len] = '\0';
  }

	// strip layout between the hyphens and the word form
  src = trim_layout(src, &len);

	// maybe there were only hyphens and layout in wordform
  if (!*src)
  {
    if (prefix && suffix)
      strcpy(dst, "--");
    else if (prefix || suffix)
      strcpy(dst, "-");
    else
      strcpy(dst, " ");
    return dst;
  }

	// check for multi token; only the stripped word form counts, so
	// layout next to the quotes or hyphens does not make a multi token
  multi_token = (strpbrk(src, " \t") != NULL);

	// mark word form with type
  if (multi_token)
  {
    *d++ = MultiTokenMark;
    if (suffix)
      *d++ = '-';
  }
  else if (prefix && suffix)
    *d++ = InfixMark;
  else if (prefix)
    *d++ = PrefixMark;
  else if (suffix)
    *d++ = SuffixMark;

	// copy word form; layout starts at 1 so that no space is emitted
	// before the first real character
  layout = 1;
  while ((c = *src++))
  {
    switch(c)
    {
      case ' ':
      case '\t':
        if (!layout)
        {
          *d++ = ' ';
          layout = 1;
        }
        break;
      case '\\':
        c = *src;
        if (c == '\0')
          break;	// dangling backslash: drop it, loop stops at the NUL
        src++;
        switch (c)
        {
          case 'n':
            *d++ = '\n';
            break;
          case 't':
            *d++ = '\t';
            break;
          default:	// \-, \", \\ and unknown escapes: keep the character
            *d++ = c;
        }
        layout = 0;
        break;
      default:
        *d++ = c;
        layout = 0;
    }
  }
  if (multi_token && prefix)
    *d++ = '-';
  *d = '\0';
  return dst;
}

//------------------------------------------------------------------------------
// Function:
//	char* unquote_copy(char* dst, const char* src, int len)
//
// Description:
//	Copy quoted string src with length len to dst.
//	Strip quotes; replace \\ with \, \n with newline, \t with tab,
//	\" with ". Any other escaped character is copied as is.
//
// Return value:
//	Pointer dst.
//
// Side Effects:
//	Contents of dst are overwritten.
//
// Memory management:
//	None.
//
// Note:
//	String src should not be preceded or followed by layout.
//	String dst should be at least len bytes long.
//	The function is currently disabled (enclosed in #if 0).
//	Parameter len is not used by the body; copying stops at the NUL
//	terminator of src.
//
//	Adaptions to unquote_copy() should also be applied to other components
//	of lexicon system (lexicon, agfl, and rts).
//------------------------------------------------------------------------------

#if 0
char *unquote_copy(char* dst, const char* src, int len)
{
  src++;		// strip quote
  char*	p = dst;
  char	c;
	// copy up to the NUL terminator; this includes the closing quote
  while ((c = *src++))
  {
    if (c == '\\')
    {
	// escape sequence: translate the character after the backslash
	// NOTE(review): if the backslash is the last character, the NUL is
	// consumed here and the loop keeps reading past the terminator
      c = *src++;
      switch (c)
      {
        case '\\':
        case '"':
          *p++ = c;
          break;
        case 'n':
          *p++ = '\n';
          break;
        case 't':
          *p++ = '\t';
          break;
        default:
          *p++ = c;
      };
    }
    else
      *p++ = c;
  }
	// overwrite the last character written (the closing quote of a
	// well-formed quoted string) with the terminator
	// NOTE(review): if nothing was copied, p == dst and this writes dst[-1]
  p[-1] = '\0';	// strip quote
  return dst;
}
#endif

//------------------------------------------------------------------------------
// Function:
//	char* strip_copy(char* dst, const char* src)
//
// Description:
//	Copy string src to string dst, normalizing layout on the way:
//	every run of spaces and/or tabs becomes a single space, except a
//	run at the very start of src, which is dropped completely.
//	A run at the end of src is kept as one trailing space.
//
// Return value:
//	Pointer dst.
//
// Side Effects:
//	Contents of dst are overwritten.
//
// Memory management:
//	None
//
// Note:
//	String dst should be at least strlen(src) + 1 bytes long.
//------------------------------------------------------------------------------

char *strip_copy(char* dst, const char* src)
{
  const char* in;
  char* out = dst;
  int in_layout = 1;	// start "inside layout" so leading blanks are dropped

  for (in = src; *in != '\0'; in++)
  {
    if ((*in != ' ') && (*in != '\t'))
    {
	// ordinary character: copy it and leave layout mode
      *out++ = *in;
      in_layout = 0;
      continue;
    }

	// space or tab: emit one space for the first of a run only
    if (!in_layout)
    {
      *out++ = ' ';
      in_layout = 1;
    }
  }

  *out = '\0';
  return dst;
}


/* Return 1 if every character of txt is an uppercase letter (as judged by
 * isupper) or an underscore, and 0 at the first character that is neither.
 * The empty string yields 1. */
int is_affix_nonterminal_part(char* txt)
{
    const char* cur;

    for (cur = txt; *cur != '\0'; cur++) {
        /* ctype functions need a value in unsigned char range */
        unsigned char ch = (unsigned char)*cur;

        if (ch != '_' && !isupper(ch)) {
            return 0;
        }
    }

    return 1;
}

/* Return 1 if txt consists only of lowercase letters (as judged by islower)
 * and the characters '_', '-', '+' and '@'; return 0 otherwise.
 * The empty string yields 1. */
int is_affix_terminal_part(char* txt)
{
    const char* cur = txt;

    while (*cur != '\0') {
        switch (*cur) {
            case '_':
            case '-':
            case '+':
            case '@':
                /* explicitly permitted punctuation */
                break;
            default:
                /* mask keeps the argument in unsigned char range */
                if (!islower(*cur & 0xff)) {
                    return 0;
                }
        }

        cur++;
    }

    return 1;
}


/* Return 1 if txt consists only of letters (lowercase or uppercase, as
 * judged by islower/isupper) and underscores; return 0 otherwise.
 * The empty string yields 1. */
int is_syntax_nonterminal_part(char* txt)
{
    char c;

    while ((c = *txt++) != '\0') {
        /* mask keeps the argument in unsigned char range */
        int is_letter = islower(c & 0xff) || isupper(c & 0xff);

        if (!is_letter && c != '_') {
            return 0;
        }
    }

    return 1;
}

/* Return 1 if txt consists only of letters (lowercase or uppercase, as
 * judged by islower/isupper), underscores and spaces; return 0 otherwise.
 * The empty string yields 1. */
int is_syntax_nonterminal_id(char* txt)
{
    const char* cur;

    for (cur = txt; *cur != '\0'; cur++) {
        /* ctype functions need a value in unsigned char range */
        unsigned char ch = (unsigned char)*cur;

        if (ch == '_' || ch == ' ') {
            continue;
        }
        if (islower(ch) || isupper(ch)) {
            continue;
        }

        return 0;
    }

    return 1;
}

/* 2002-09-16 PB Allowed for underscores in module names after initial lowercase character */

/* Return 1 if txt starts with a lowercase letter and continues with only
 * lowercase letters, digits and underscores; return 0 otherwise.
 * The empty string yields 0, since it has no lowercase first character. */
int is_module_name_part(char* txt)
{
    /* view the bytes as unsigned so they can go straight to ctype functions */
    const unsigned char* cur = (const unsigned char*)txt;

    if (!islower(*cur)) {
        return 0;
    }

    /* the first character is already known to be acceptable */
    for (cur++; *cur != '\0'; cur++) {
        if (islower(*cur) || isdigit(*cur) || *cur == '_') {
            continue;
        }

        return 0;
    }

    return 1;
}

